Now using py files, for automation
[covid19.git] / uk_deaths_import.md
1 ---
2 jupyter:
3 jupytext:
4 formats: ipynb,md
5 text_representation:
6 extension: .md
7 format_name: markdown
8 format_version: '1.2'
9 jupytext_version: 1.9.1
10 kernelspec:
11 display_name: Python 3
12 language: python
13 name: python3
14 ---
15
16 <!-- #region Collapsed="false" -->
17 Data from:
18
19 * [Office of National Statistics](https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales) (Endland and Wales) Weeks start on a Saturday.
20 * [Northern Ireland Statistics and Research Agency](https://www.nisra.gov.uk/publications/weekly-deaths) (Northern Ireland). Weeks start on a Saturday. Note that the week numbers don't match the England and Wales data.
21 * [National Records of Scotland](https://www.nrscotland.gov.uk/statistics-and-data/statistics/statistics-by-theme/vital-events/general-publications/weekly-and-monthly-data-on-births-and-deaths/weekly-data-on-births-and-deaths) (Scotland). Note that Scotland uses ISO8601 week numbers, which start on a Monday.
22
23 <!-- #endregion -->
24
25 ```python Collapsed="false"
26 import itertools
27 import collections
28 import json
29 import pandas as pd
30 import numpy as np
31 from scipy.stats import gmean
32 import datetime
33
34 import matplotlib as mpl
35 import matplotlib.pyplot as plt
36 %matplotlib inline
37
38 from sqlalchemy.types import Integer, Text, String, DateTime, Float
39 from sqlalchemy import create_engine
40 %load_ext sql
41 ```
42
43 ```python Collapsed="false"
44 connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
45 ```
46
47 ```python Collapsed="false"
48 %sql $connection_string
49 ```
50
51 ```python Collapsed="false"
52 engine = create_engine(connection_string)
53 ```
54
55 ```python Collapsed="false"
56 england_wales_filename = 'uk-deaths-data/copyofpublishedweek042021.xlsx'
57 ```
58
59 ```python Collapsed="false"
60 scotland_filename = 'uk-deaths-data/Scottish Government COVID-19 data (10 February 2021).xlsx'
61 ```
62
63 ```python
64 n_ireland_filename = 'uk-deaths-data/Weekly_Deaths_0.xlsx'
65 ```
66
67 ```python Collapsed="false"
68 eng_xls = pd.read_excel(england_wales_filename,
69 sheet_name="Weekly figures 2021",
70 skiprows=[0, 1, 2, 3],
71 skipfooter=11,
72 header=0,
73 index_col=[1]
74 ).iloc[:91].T
75 eng_xls
76 ```
77
78 ```python Collapsed="false"
79 eng_xls_columns = list(eng_xls.columns)
80
81 for i, c in enumerate(eng_xls_columns):
82 # print(i, c, type(c), isinstance(c, float))
83 if isinstance(c, float) and np.isnan(c):
84 if eng_xls.iloc[0].iloc[i] is not pd.NaT:
85 eng_xls_columns[i] = eng_xls.iloc[0].iloc[i]
86
87 # np.isnan(eng_xls_columns[0])
88 # eng_xls_columns
89
90 eng_xls.columns = eng_xls_columns
91 # eng_xls.columns
92 ```
93
94 ```python
95 eng_xls
96 ```
97
98 ```python
99 rd = eng_xls.iloc[1:][['Week ended', 'Wales']].reset_index(level=0).rename(
100 columns={'Week ended': 'date_up_to', 'Wales': 'deaths',
101 'index': 'week'}
102 )
103 rd['year'] = 2021
104 rd['nation'] = 'Wales'
105 rd.dropna(inplace=True)
106 rd.head()
107 ```
108
109 ```python
110 query_string = '''
111 delete from all_causes_deaths
112 where nation = 'Wales'
113 and year = 2021;
114 '''
115 with engine.connect() as connection:
116 connection.execute(query_string)
117 ```
118
119 ```python
120 rd.to_sql(
121 'all_causes_deaths',
122 engine,
123 if_exists='append',
124 index=False)
125 ```
126
127 ```python
128 %sql select * from all_causes_deaths where year = 2021 limit 10
129 ```
130
131 ```python
132 rd = eng_xls.iloc[1:][['Week ended', 'Total deaths, all ages (2021)', 'Wales']].reset_index(level=0).rename(
133 columns={'Week ended': 'date_up_to',
134 'Total deaths, all ages (2021)': 'ew_deaths',
135 'Wales': 'w_deaths',
136 'index': 'week'}
137 )
138 rd['year'] = 2021
139 rd['nation'] = 'England'
140 rd['deaths'] = rd['ew_deaths'] - rd['w_deaths']
141 rd.drop(labels=['ew_deaths', 'w_deaths'], axis='columns', inplace=True)
142 rd.dropna(inplace=True)
143 rd.tail()
144 ```
145
146 ```python
147 query_string = '''
148 delete from all_causes_deaths
149 where nation = 'England'
150 and year = 2021;
151 '''
152 with engine.connect() as connection:
153 connection.execute(query_string)
154 ```
155
156 ```python
157 rd.to_sql(
158 'all_causes_deaths',
159 engine,
160 if_exists='append',
161 index=False)
162 ```
163
164 ```python
165 %sql select nation, sum(deaths), count(*) from all_causes_deaths where year = 2021 group by nation
166 ```
167
168 ```python
169 ni_xls = pd.read_excel(n_ireland_filename,
170 sheet_name='Table 1',
171 skiprows=[0, 1, 2, 3],
172 header=0,
173 ).rename(
174 columns={'Week Ending (Friday)': 'date_up_to', 'Total Number of Deaths Registered in Week (2021P)': 'deaths',
175 'Registration Week': 'week'})
176 rd = ni_xls[ni_xls['deaths'].notna()][['week', 'date_up_to', 'deaths']]
177 rd['year'] = 2021
178 rd['nation'] = 'Northern Ireland'
179 rd
180 ```
181
182 ```python
183 query_string = '''
184 delete from all_causes_deaths
185 where nation = 'Northern Ireland'
186 and year = 2021;
187 '''
188 with engine.connect() as connection:
189 connection.execute(query_string)
190 ```
191
192 ```python
193 rd.to_sql(
194 'all_causes_deaths',
195 engine,
196 if_exists='append',
197 index=False)
198 ```
199
200 ```python
201 %sql select nation from all_causes_deaths group by nation
202 ```
203
204 ```python
205 # sco_xls = pd.read_excel(scotland_filename,
206 # sheet_name="2.2_excess",
207 # skiprows=[0, 1, 2, 3],
208 # skipfooter=3,
209 # header=0,
210 # index_col=[1]
211 # ).iloc[:91].T
212 ```
213
214 ```python
215 import openpyxl
216 ```
217
218 ```python
219 wb = openpyxl.load_workbook(scotland_filename, read_only=True)#, data_only=True, keep_links=False)
220 sheet = wb.worksheets[7]
221 sheet
222 ```
223
224 ```python
225 sheet.reset_dimensions()
226 ```
227
228 ```python
229 scot_elems = [[value for value in row] for row in sheet.values]
230 scot_cols = scot_elems[3]
231 scot_dicts = [{k: v for k, v in zip(scot_cols, row)} for row in scot_elems[4:]]
232 scot_data = pd.DataFrame(scot_dicts)
233 rd = scot_data[scot_data.date >= '2021'].rename(
234 columns={'week_number': 'week', 'date': 'date_up_to', 'total_deaths': 'deaths'})[['week', 'date_up_to', 'deaths']]
235 rd['year'] = 2021
236 rd['nation'] = 'Scotland'
237 rd
238 ```
239
240 ```python
241 query_string = '''
242 delete from all_causes_deaths
243 where nation = 'Scotland'
244 and year = 2021;
245 '''
246 with engine.connect() as connection:
247 connection.execute(query_string)
248 ```
249
250 ```python
251 rd.to_sql(
252 'all_causes_deaths',
253 engine,
254 if_exists='append',
255 index=False)
256 ```
257
258 ```python
259 %sql select nation, sum(deaths), count(*) from all_causes_deaths where year = 2021 group by nation
260 ```
261
262 ```python
263
264 ```
265
266 # Create graphs
267
268 ```python
269 qstr = '''select week, year, deaths
270 from all_causes_deaths
271 where nation = 'England' '''
272 deaths_e = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
273 deaths_e.head()
274 ```
275
276 ```python
277 qstr = '''select week, year, deaths
278 from all_causes_deaths
279 where nation = 'Wales' '''
280 deaths_w = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
281 deaths_w.head()
282 ```
283
284 ```python
285 qstr = '''select week, year, deaths
286 from all_causes_deaths
287 where nation = 'Scotland' '''
288 deaths_s = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
289 deaths_s.head()
290 ```
291
292 ```python
293 qstr = '''select week, year, deaths
294 from all_causes_deaths
295 where nation = 'Northern Ireland' '''
296 deaths_i = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
297 deaths_i.head()
298 ```
299
300 ```python
301 qstr = '''select week, avg(deaths) as prev_mean
302 from all_causes_deaths
303 where year <= 2019 and nation='England'
304 group by week
305 order by week'''
306 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
307 deaths_prev.head()
308 deaths_e = deaths_e.merge(deaths_prev, on='week')
309 deaths_e.head()
310 ```
311
312 ```python
313 qstr = '''select week, avg(deaths) as prev_mean
314 from all_causes_deaths
315 where year <= 2019 and nation='Wales'
316 group by week
317 order by week'''
318 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
319 deaths_prev.head()
320 deaths_w = deaths_w.merge(deaths_prev, on='week')
321 deaths_w.head()
322 ```
323
324 ```python
325 qstr = '''select week, avg(deaths) as prev_mean
326 from all_causes_deaths
327 where year <= 2019 and nation='Scotland'
328 group by week
329 order by week'''
330 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
331 deaths_prev.head()
332 deaths_s = deaths_s.merge(deaths_prev, on='week')
333 deaths_s.head()
334 ```
335
336 ```python
337 qstr = '''select week, avg(deaths) as prev_mean
338 from all_causes_deaths
339 where year <= 2019 and nation='Northern Ireland'
340 group by week
341 order by week'''
342 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
343 deaths_prev.head()
344 deaths_i = deaths_i.merge(deaths_prev, on='week')
345 deaths_i.head()
346 ```
347
348 ```python Collapsed="false"
349 deaths = deaths_e + deaths_w + deaths_i + deaths_s
350 deaths.head()
351 ```
352
353 ```python Collapsed="false"
354 deaths[[2021, 2020, 2019, 2018, 2017, 2016, 2015]].plot(figsize=(14, 8))
355 ```
356
357 ```python Collapsed="false"
358 deaths[[2021, 2020, 'prev_mean']].plot(figsize=(10, 8))
359 ```
360
361 ```python Collapsed="false"
362 deaths_i.plot()
363 ```
364
365 ```python
366 deaths[2020].sum() - deaths.prev_mean.sum()
367 ```
368
369 ```python
370 def _closeline(line):
371 x, y = line.get_data()
372 x = np.concatenate((x, [x[0]]))
373 y = np.concatenate((y, [y[0]]))
374 line.set_data(x, y)
375
376 def _closeline_connect(lines):
377 for line0, line1 in zip(lines, lines[1:]):
378 x0, y0 = line0.get_data()
379 x1, y1 = line1.get_data()
380
381 x0 = np.concatenate((x0, [x1[0]]))
382 y0 = np.concatenate((y0, [y1[0]]))
383 line0.set_data(x0, y0)
384 ```
385
386 ```python Collapsed="false"
387 # Radar plot code taken from example at https://stackoverflow.com/questions/42878485/getting-matplotlib-radar-plot-with-pandas#
388
389 def create_and_save_radar_plot(dataset, title_string, filename_suffix):
390
391 fig = plt.figure(figsize=(10, 10))
392 ax = fig.add_subplot(111, projection="polar")
393
394 theta = np.roll(
395 np.flip(
396 np.arange(len(dataset))/float(len(dataset))*2.*np.pi),
397 14)
398 l15, = ax.plot(theta, dataset[2015], color="#e47d7d", label="2015") # 0
399 l16, = ax.plot(theta, dataset[2016], color="#afc169", label="2016") # 72 , d0e47d
400 l17, = ax.plot(theta, dataset[2017], color="#7de4a6", label="2017") # 144
401 l18, = ax.plot(theta, dataset[2018], color="#7da6e4", label="2018") # 216
402 l19, = ax.plot(theta, dataset[2019], color="#d07de4", label="2019") # 288
403
404 lmean, = ax.plot(theta, dataset['prev_mean'], color="black", linestyle='dashed', label="mean, 15–19")
405
406 l20, = ax.plot(theta, dataset[2020], color="#bb0000", label="2020")
407 l21, = ax.plot(theta, dataset[2021], color="#ff0000", label="2021")
408
409 # deaths_headlines.total_2019.plot(ax=ax)
410
411 _closeline(lmean)
412 _closeline_connect([l15, l16, l17, l18, l19, l20, l21])
413
414 ax.set_xticks(theta)
415 ax.set_xticklabels(dataset.index)
416 plt.legend()
417 plt.title(f"Deaths by week over years, {title_string}")
418 plt.savefig(f'deaths-radar-2021{filename_suffix}.png')
419 plt.show()
420 ```
421
422 ```python
423 create_and_save_radar_plot(deaths, 'all UK', '')
424 ```
425
426 ```python Collapsed="false"
427 # # Radar plot code taken from example at https://stackoverflow.com/questions/42878485/getting-matplotlib-radar-plot-with-pandas#
428
429 # dhna = deaths# .dropna()
430
431 # fig = plt.figure(figsize=(10, 10))
432 # ax = fig.add_subplot(111, projection="polar")
433
434 # theta = np.roll(
435 # np.flip(
436 # np.arange(len(dhna))/float(len(dhna))*2.*np.pi),
437 # 14)
438 # # l15, = ax.plot(theta, deaths_headlines['total_2015'], color="#b56363", label="2015") # 0
439 # # l16, = ax.plot(theta, deaths_headlines['total_2016'], color="#a4b563", label="2016") # 72
440 # # l17, = ax.plot(theta, deaths_headlines['total_2017'], color="#63b584", label="2017") # 144
441 # # l18, = ax.plot(theta, deaths_headlines['total_2018'], color="#6384b5", label="2018") # 216
442 # # l19, = ax.plot(theta, deaths_headlines['total_2019'], color="#a4635b", label="2019") # 288
443 # l15, = ax.plot(theta, dhna[2015], color="#e47d7d", label="2015") # 0
444 # l16, = ax.plot(theta, dhna[2016], color="#afc169", label="2016") # 72 , d0e47d
445 # l17, = ax.plot(theta, dhna[2017], color="#7de4a6", label="2017") # 144
446 # l18, = ax.plot(theta, dhna[2018], color="#7da6e4", label="2018") # 216
447 # l19, = ax.plot(theta, dhna[2019], color="#d07de4", label="2019") # 288
448
449 # lmean, = ax.plot(theta, dhna['prev_mean'], color="black", linestyle='dashed', label="mean, 15–19")
450
451 # l20, = ax.plot(theta, dhna[2020], color="#bb0000", label="2020")
452 # l21, = ax.plot(theta, dhna[2021], color="#ff0000", label="2021")
453
454 # # deaths_headlines.total_2019.plot(ax=ax)
455
456 # _closeline(lmean)
457 # _closeline_connect([l15, l16, l17, l18, l19, l20, l21])
458
459 # ax.set_xticks(theta)
460 # ax.set_xticklabels(dhna.index)
461 # plt.legend()
462 # plt.title("Deaths by week over years, all UK")
463 # plt.savefig('deaths-radar-2021.png')
464 # plt.show()
465 ```
466
467 <!-- #region Collapsed="false" -->
468 # Plots for UK nations
469 <!-- #endregion -->
470
471 ```python
472 create_and_save_radar_plot(deaths_e, 'England', '-england')
473 ```
474
475 ```python
476 create_and_save_radar_plot(deaths_w, 'Wales', '-wales')
477 ```
478
479 ```python
480 create_and_save_radar_plot(deaths_s, 'Scotland', '-scotland')
481 ```
482
483 ```python
484 create_and_save_radar_plot(deaths_i, 'Northern Ireland', '-northern-ireland')
485 ```
486
487 ```python Collapsed="false"
488
489 ```