General updates
[covid19.git] / euromomo / euromomo.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {},
7 "outputs": [],
8 "source": [
9 "import json\n",
10 "import requests\n",
11 "from bs4 import BeautifulSoup"
12 ]
13 },
14 {
15 "cell_type": "code",
16 "execution_count": 2,
17 "metadata": {},
18 "outputs": [],
19 "source": [
"# Output file name; change as needed\n",
"OUTPUT_FILE = \"euromomo.json\""
22 ]
23 },
24 {
25 "cell_type": "code",
26 "execution_count": 3,
27 "metadata": {},
28 "outputs": [],
29 "source": [
"# Change these if the script breaks\n",
"# Substring used to recognise the wanted JS file among the page's script links\n",
"JS_FILE_PATTERN = 'src-templates-graphs-and-maps-js'\n",
"# Page that is downloaded and searched for that script link\n",
"BASE_PAGE = 'https://www.euromomo.eu/graphs-and-maps/'"
33 ]
34 },
35 {
36 "cell_type": "code",
37 "execution_count": 30,
38 "metadata": {},
39 "outputs": [],
40 "source": [
"# First, we need to find the JS link inside the webpage.\n",
"# The page lists its scripts as <link as=\"script\" href=...> tags; we keep the\n",
"# first one whose href contains JS_FILE_PATTERN (link_to_file stays None otherwise).\n",
"# requests applies no timeout by default, so set one to avoid hanging forever.\n",
"r = requests.get(BASE_PAGE, timeout=30)\n",
"if r.status_code != 200:\n",
"    raise Exception(f\"Cannot reach webpage {BASE_PAGE} {r.status_code}\")\n",
"soup = BeautifulSoup(r.text, features=\"lxml\")\n",
"link_to_file = None\n",
"for possible in soup.find_all(\"link\", attrs={\"as\": \"script\"}):\n",
"    # .get() tolerates a <link> without an href (indexing would raise KeyError)\n",
"    href = possible.get(\"href\") or \"\"\n",
"    if JS_FILE_PATTERN in href:\n",
"        # hrefs on the page are site-relative (e.g. \"/component---...js\")\n",
"        link_to_file = \"https://www.euromomo.eu\" + href\n",
"        break"
51 ]
52 },
53 {
54 "cell_type": "code",
55 "execution_count": 31,
56 "metadata": {},
57 "outputs": [
58 {
59 "data": {
60 "text/plain": [
61 "[<link as=\"script\" href=\"/50fe40790468168eec944b4bf3a7387c41e08258-80a60393842707b4ef3f.js\" rel=\"preload\"/>,\n",
62 " <link as=\"script\" href=\"/framework-e94c62ba7ecbd1156bac.js\" rel=\"preload\"/>,\n",
63 " <link as=\"script\" href=\"/app-c8f47833a3ed3eeb35b5.js\" rel=\"preload\"/>,\n",
64 " <link as=\"script\" href=\"/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js\" rel=\"preload\"/>,\n",
65 " <link as=\"script\" href=\"/commons-5c085e7cd1a604626d58.js\" rel=\"preload\"/>,\n",
66 " <link as=\"script\" href=\"/5e2a4920-b8478647cb0804da9c90.js\" rel=\"preload\"/>,\n",
67 " <link as=\"script\" href=\"/styles-c41920853eed0685ae46.js\" rel=\"preload\"/>,\n",
68 " <link as=\"script\" href=\"/webpack-runtime-ba300874af730e00e5e4.js\" rel=\"preload\"/>]"
69 ]
70 },
71 "execution_count": 31,
72 "metadata": {},
73 "output_type": "execute_result"
74 }
75 ],
76 "source": [
77 "soup.find_all(\"link\", attrs={\"as\": \"script\"})"
78 ]
79 },
80 {
81 "cell_type": "code",
82 "execution_count": 32,
83 "metadata": {},
84 "outputs": [],
85 "source": [
"# Stop here when no matching JS file was found. Raising (rather than calling\n",
"# exit(1)) aborts the run with a visible traceback; inside a notebook exit()\n",
"# targets the kernel/session instead of reporting a failure.\n",
"if link_to_file is None:\n",
"    raise RuntimeError(f\"Could not find a JS file with {JS_FILE_PATTERN} in its name :-(\")"
89 ]
90 },
91 {
92 "cell_type": "code",
93 "execution_count": 33,
94 "metadata": {},
95 "outputs": [
96 {
97 "data": {
98 "text/plain": [
99 "'https://www.euromomo.eu/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js'"
100 ]
101 },
102 "execution_count": 33,
103 "metadata": {},
104 "output_type": "execute_result"
105 }
106 ],
107 "source": [
108 "link_to_file"
109 ]
110 },
111 {
112 "cell_type": "code",
113 "execution_count": 34,
114 "metadata": {},
115 "outputs": [],
116 "source": [
"# We need a bit of renaming: a readable name is inferred from each value\n",
"def rename_key(key, value):\n",
"    \"\"\"Guess a descriptive name for one top-level entry of the parsed payload.\n",
"\n",
"    The name is inferred only from the type and content of ``value``:\n",
"\n",
"    * list containing \"Belgium\" -> \"countries\"; containing \"Total\" -> \"age_groups\"\n",
"    * numeric string <= 53 -> \"week\"; >= 2020 -> \"year\"\n",
"    * dict with keys {counts, zscores, weeks} -> \"data_totals\"\n",
"    * dict with keys {data, years} -> \"excess_mortality\"\n",
"    * dict with keys {data, weeks} -> \"z_scores_country_age_groups\" when it holds\n",
"      exactly 24 data entries, \"z_scores_country\" when it holds 276 or more\n",
"\n",
"    Args:\n",
"        key: the original key; only used in the error message.\n",
"        value: the value stored under ``key``.\n",
"\n",
"    Returns:\n",
"        The inferred name (str).\n",
"\n",
"    Raises:\n",
"        ValueError: when no rule matches, including non-numeric strings.\n",
"\n",
"    NOTE(review): the output stored in this notebook shows a payload with keys\n",
"    such as 'weeks', 'ages' and 'pooled', and this function raising on 'weeks'.\n",
"    The rules probably need revisiting for the current upstream format.\n",
"    \"\"\"\n",
"    if isinstance(value, list):\n",
"        if \"Belgium\" in value:\n",
"            return \"countries\"\n",
"        if \"Total\" in value:\n",
"            return \"age_groups\"\n",
"    elif isinstance(value, str):\n",
"        try:\n",
"            number = int(value)\n",
"        except ValueError:\n",
"            # Not a number: fall through to the common error below\n",
"            number = None\n",
"        if number is not None:\n",
"            # ISO years can have 53 weeks (2020 did), so 53 is still a week number\n",
"            if number <= 53:\n",
"                return \"week\"\n",
"            if number >= 2020:\n",
"                return \"year\"\n",
"    elif isinstance(value, dict):\n",
"        keys = set(value)\n",
"        if keys == {\"counts\", \"zscores\", \"weeks\"}:\n",
"            return \"data_totals\"\n",
"        if keys == {\"data\", \"years\"}:\n",
"            return \"excess_mortality\"\n",
"        if keys == {\"data\", \"weeks\"}:\n",
"            entries = len(value[\"data\"])\n",
"            if entries == 24:\n",
"                return \"z_scores_country_age_groups\"\n",
"            if entries >= 276:\n",
"                return \"z_scores_country\"\n",
"    raise ValueError(f\"Cannot find valid key name. Key is {key} with value of type {type(value)}\")"
142 ]
143 },
144 {
145 "cell_type": "code",
146 "execution_count": 35,
147 "metadata": {},
148 "outputs": [],
149 "source": [
"# Download the JS file found above. Check the status so an HTTP error page is\n",
"# never mistaken for the script, and set a timeout (requests has none by default).\n",
"js_response = requests.get(link_to_file, timeout=30)\n",
"if js_response.status_code != 200:\n",
"    raise Exception(f\"Cannot download JS file {link_to_file} {js_response.status_code}\")\n",
"js_file: str = js_response.text"
151 ]
152 },
153 {
154 "cell_type": "code",
155 "execution_count": 36,
156 "metadata": {},
157 "outputs": [],
158 "source": [
"# Collect the body of every JSON.parse('...') call found in the JS file.\n",
"# NOTE(review): each literal is copied verbatim, so JavaScript escape sequences\n",
"# inside it are not decoded -- confirm upstream never emits any.\n",
"marker = \"JSON.parse('\"\n",
"pos = 0\n",
"found = []\n",
"while True:\n",
"    pos = js_file.find(marker, pos)\n",
"    if pos == -1:\n",
"        break\n",
"    start = pos + len(marker)\n",
"    end_pos = js_file.find(\"')\", start)\n",
"    if end_pos == -1:\n",
"        # Unterminated call: slicing with -1 would silently grab the rest of the file\n",
"        break\n",
"\n",
"    content = js_file[start:end_pos]\n",
"    assert \"'\" not in content\n",
"    found.append(content)\n",
"    # Resume the search right after this marker\n",
"    pos = start\n",
"\n",
"if not found:\n",
"    # Clearer than the bare \"max() arg is an empty sequence\" error\n",
"    raise Exception(\"No JSON.parse('...') call found in the JS file\")\n",
"\n",
"# the biggest JSON part in the file is probably the good one ;-)\n",
"biggest = max(found, key=len)"
174 ]
175 },
176 {
177 "cell_type": "code",
178 "execution_count": 37,
179 "metadata": {},
180 "outputs": [],
181 "source": [
182 "raw_data = json.loads(biggest)"
183 ]
184 },
185 {
186 "cell_type": "code",
187 "execution_count": 38,
188 "metadata": {},
189 "outputs": [
190 {
191 "name": "stdout",
192 "output_type": "stream",
193 "text": [
194 "doing weeks\n"
195 ]
196 },
197 {
198 "ename": "Exception",
199 "evalue": "Cannot find valid key name. Key is weeks with value of type <class 'list'>",
200 "output_type": "error",
201 "traceback": [
202 "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
203 "\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
204 "\u001b[0;32m<ipython-input-38-7fbed730e1a3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
205 "\u001b[0;32m<ipython-input-38-7fbed730e1a3>\u001b[0m in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
206 "\u001b[0;32m<ipython-input-34-f58db1eb5796>\u001b[0m in \u001b[0;36mrename_key\u001b[0;34m(key, value)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m276\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"z_scores_country\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Cannot find valid key name. Key is {key} with value of type {type(value)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
207 "\u001b[0;31mException\u001b[0m: Cannot find valid key name. Key is weeks with value of type <class 'list'>"
208 ]
209 }
210 ],
211 "source": [
"# Rename every top-level entry. Refuse to continue if two entries end up with\n",
"# the same name, because the dict would silently keep only the last one.\n",
"data = {}\n",
"for k, v in raw_data.items():\n",
"    new_key = rename_key(k, v)\n",
"    if new_key in data:\n",
"        raise Exception(f\"Two entries were both renamed to {new_key} (second key is {k})\")\n",
"    data[new_key] = v"
213 ]
214 },
215 {
216 "cell_type": "code",
217 "execution_count": 39,
218 "metadata": {},
219 "outputs": [],
220 "source": [
"# Write the renamed data to the file configured in OUTPUT_FILE\n",
"# (previously the file name was repeated here as a literal).\n",
"with open(OUTPUT_FILE, 'w') as f:\n",
"    json.dump(data, f)"
223 ]
224 },
225 {
226 "cell_type": "code",
227 "execution_count": 13,
228 "metadata": {},
229 "outputs": [
230 {
231 "data": {
232 "text/plain": [
233 "1"
234 ]
235 },
236 "execution_count": 13,
237 "metadata": {},
238 "output_type": "execute_result"
239 }
240 ],
241 "source": [
242 "len(found)"
243 ]
244 },
245 {
246 "cell_type": "code",
247 "execution_count": 23,
248 "metadata": {},
249 "outputs": [
250 {
251 "data": {
252 "text/plain": [
253 "['weeks', 'ages', 'pooled', 'countries', 'reportYear', 'reportWeek']"
254 ]
255 },
256 "execution_count": 23,
257 "metadata": {},
258 "output_type": "execute_result"
259 }
260 ],
261 "source": [
262 "[k for k in raw_data.keys()]"
263 ]
264 },
265 {
266 "cell_type": "code",
267 "execution_count": 26,
268 "metadata": {},
269 "outputs": [
270 {
271 "data": {
272 "text/plain": [
273 "list"
274 ]
275 },
276 "execution_count": 26,
277 "metadata": {},
278 "output_type": "execute_result"
279 }
280 ],
281 "source": [
282 "type([1,2,3])"
283 ]
284 },
285 {
286 "cell_type": "code",
287 "execution_count": null,
288 "metadata": {},
289 "outputs": [],
290 "source": []
291 }
292 ],
293 "metadata": {
294 "kernelspec": {
295 "display_name": "Python 3",
296 "language": "python",
297 "name": "python3"
298 },
299 "language_info": {
300 "codemirror_mode": {
301 "name": "ipython",
302 "version": 3
303 },
304 "file_extension": ".py",
305 "mimetype": "text/x-python",
306 "name": "python",
307 "nbconvert_exporter": "python",
308 "pygments_lexer": "ipython3",
309 "version": "3.7.4"
310 }
311 },
312 "nbformat": 4,
313 "nbformat_minor": 4
314 }