{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from urllib.parse import urljoin\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Name of the JSON output file; change this to write somewhere else\n",
"# NOTE(review): confirm the cell that writes the file actually reads this constant\n",
"OUTPUT_FILE = 'euromomo.json'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Change these if the script breaks:\n",
"# - BASE_PAGE is the web page that is downloaded to look for the script link\n",
"# - JS_FILE_PATTERN is the substring searched for in each script link's href\n",
"BASE_PAGE = \"https://www.euromomo.eu/graphs-and-maps/\"\n",
"JS_FILE_PATTERN = \"src-templates-graphs-and-maps-js\""
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# First, we need to find the JS link inside the webpage\n",
"# (the timeout keeps an unresponsive server from blocking the notebook forever)\n",
"r = requests.get(BASE_PAGE, timeout=30)\n",
"if r.status_code != 200:\n",
"    raise Exception(f\"Cannot reach webpage {BASE_PAGE} {r.status_code}\")\n",
"soup = BeautifulSoup(r.text, features=\"lxml\")\n",
"\n",
"# Keep the first script-preload link whose href contains JS_FILE_PATTERN\n",
"link_to_file = None\n",
"for possible in soup.find_all(\"link\", attrs={\"as\": \"script\"}):\n",
"    href = possible.get(\"href\", \"\")  # tolerate link tags that carry no href\n",
"    if JS_FILE_PATTERN in href:\n",
"        # urljoin resolves site-relative hrefs and leaves absolute ones untouched\n",
"        link_to_file = urljoin(BASE_PAGE, href)\n",
"        break"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[,\n",
" ,\n",
" ,\n",
" ,\n",
" ,\n",
" ,\n",
" ,\n",
" ]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspection aid: list every link tag on the page that has as=\"script\"\n",
"soup.find_all(\"link\", attrs={\"as\": \"script\"})"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Raise instead of calling exit(): exit() is a helper meant for interactive\n",
"# shells, whereas an exception stops the run with a visible traceback\n",
"if link_to_file is None:\n",
"    raise RuntimeError(f\"Could not find a JS file with {JS_FILE_PATTERN} in its name :-(\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://www.euromomo.eu/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the URL of the JS file selected above\n",
"link_to_file"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# We need a bit of renaming: each raw key gets a name derived from its value\n",
"def rename_key(key, value):\n",
"    \"\"\"Return a descriptive name for a raw entry, inferred from its value.\n",
"\n",
"    Args:\n",
"        key: Original key; only used in the error message.\n",
"        value: Value stored under ``key`` (a list, a str, an int or a dict).\n",
"\n",
"    Returns:\n",
"        The new key name, e.g. ``countries``, ``week`` or ``data_totals``.\n",
"\n",
"    Raises:\n",
"        ValueError: If none of the naming rules matches ``value``.\n",
"    \"\"\"\n",
"    if isinstance(value, list):\n",
"        if \"Belgium\" in value:\n",
"            return \"countries\"\n",
"        if \"Total\" in value:\n",
"            return \"age_groups\"\n",
"    elif isinstance(value, (str, int)) and not isinstance(value, bool):\n",
"        try:\n",
"            number = int(value)\n",
"        except ValueError:\n",
"            number = None  # non-numeric string: reported by the raise at the end\n",
"        if number is not None:\n",
"            # 53 is accepted because ISO years can have 53 weeks\n",
"            if number <= 53:\n",
"                return \"week\"\n",
"            if number >= 2020:\n",
"                return \"year\"\n",
"    elif isinstance(value, dict):\n",
"        fields = set(value)\n",
"        if fields == {\"counts\", \"zscores\", \"weeks\"}:\n",
"            return \"data_totals\"\n",
"        if fields == {\"data\", \"years\"}:\n",
"            return \"excess_mortality\"\n",
"        if fields == {\"data\", \"weeks\"}:\n",
"            # NOTE(review): 24 and 276 look like expected series counts - confirm\n",
"            if len(value[\"data\"]) == 24:\n",
"                return \"z_scores_country_age_groups\"\n",
"            if len(value[\"data\"]) >= 276:\n",
"                return \"z_scores_country\"\n",
"    raise ValueError(f\"Cannot find valid key name. Key is {key} with value of type {type(value)}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Download the JS file; stop on an HTTP error rather than scanning an error page\n",
"js_response = requests.get(link_to_file, timeout=30)\n",
"if js_response.status_code != 200:\n",
"    raise Exception(f\"Cannot reach JS file {link_to_file} {js_response.status_code}\")\n",
"js_file: str = js_response.text"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# Collect the text found between every JSON.parse(' and the next ')\n",
"JSON_PARSE_MARKER = \"JSON.parse('\"\n",
"found = []\n",
"pos = js_file.find(JSON_PARSE_MARKER)\n",
"while pos != -1:\n",
"    start = pos + len(JSON_PARSE_MARKER)\n",
"    end_pos = js_file.find(\"')\", start)\n",
"    if end_pos == -1:\n",
"        # a -1 here would make the slice below silently run to the end of the file\n",
"        raise Exception(f\"Unterminated JSON.parse call at offset {pos}\")\n",
"\n",
"    content = js_file[start:end_pos]\n",
"    assert \"'\" not in content, f\"Unexpected quote in JSON.parse payload at offset {pos}\"\n",
"    found.append(content)\n",
"    # resume after this payload instead of rescanning it character by character\n",
"    pos = js_file.find(JSON_PARSE_MARKER, end_pos)\n",
"\n",
"if not found:\n",
"    raise Exception(\"No JSON.parse('...') call found in the JS file\")\n",
"\n",
"# the biggest JSON part in the file is probably the good one ;-)\n",
"biggest = max(found, key=len)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# Decode the largest JSON.parse payload into Python objects\n",
"# NOTE(review): JavaScript string-literal escapes are not undone before decoding;\n",
"# confirm the payload never relies on them\n",
"raw_data = json.loads(biggest)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"doing weeks\n"
]
},
{
"ename": "Exception",
"evalue": "Cannot find valid key name. Key is weeks with value of type ",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m\u001b[0m in \u001b[0;36mrename_key\u001b[0;34m(key, value)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m276\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"z_scores_country\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Cannot find valid key name. Key is {key} with value of type {type(value)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mException\u001b[0m: Cannot find valid key name. Key is weeks with value of type "
]
}
],
"source": [
"# Rename every raw key; refuse to silently drop an entry when two raw keys\n",
"# end up with the same new name\n",
"data = {}\n",
"for raw_key, raw_value in raw_data.items():\n",
"    new_key = rename_key(raw_key, raw_value)\n",
"    if new_key in data:\n",
"        raise Exception(f\"Keys collide after renaming: {raw_key} is also renamed to {new_key}\")\n",
"    data[new_key] = raw_value"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"# Write to the path configured in OUTPUT_FILE rather than a hard-coded file name\n",
"with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:\n",
"    json.dump(data, f)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of JSON.parse payloads collected from the JS file\n",
"len(found)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['weeks', 'ages', 'pooled', 'countries', 'reportYear', 'reportWeek']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the top-level keys of the decoded data\n",
"list(raw_data.keys())"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"list"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Scratch check: what type() reports for a list literal\n",
"type([1,2,3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}