uk_deaths_import.md

   1 ---
   2 jupyter:
   3   jupytext:
   4     formats: ipynb,md
   5     text_representation:
   6       extension: .md
   7       format_name: markdown
   8       format_version: '1.3'
   9       jupytext_version: 1.11.1
  10   kernelspec:
  11     display_name: Python 3
  12     language: python
  13     name: python3
  14 ---
  15
  16 <!-- #region Collapsed="false" -->
  17 Data from:
  18
  19 * [Office of National Statistics](https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales) (Endland and Wales) Weeks start on a Saturday.
  20 * [Northern Ireland Statistics and Research Agency](https://www.nisra.gov.uk/publications/weekly-deaths) (Northern Ireland). Weeks start on a Saturday. Note that the week numbers don't match the England and Wales data.
  21 * [National Records of Scotland](https://www.nrscotland.gov.uk/statistics-and-data/statistics/statistics-by-theme/vital-events/general-publications/weekly-and-monthly-data-on-births-and-deaths/weekly-data-on-births-and-deaths) (Scotland). Note that Scotland uses ISO8601 week numbers, which start on a Monday.
  22
  23 <!-- #endregion -->
  24
  25 ```python Collapsed="false"
  26 import itertools
  27 import collections
  28 import json
  29 import pandas as pd
  30 import numpy as np
  31 from scipy.stats import gmean
  32 import datetime
  33
  34 import matplotlib as mpl
  35 import matplotlib.pyplot as plt
  36 %matplotlib inline
  37
  38 from sqlalchemy.types import Integer, Text, String, DateTime, Float
  39 from sqlalchemy import create_engine
  40 %load_ext sql
  41 ```
  42
  43 ```python Collapsed="false"
  44 connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
  45 ```
  46
  47 ```python Collapsed="false"
  48 %sql $connection_string
  49 ```
  50
  51 ```python Collapsed="false"
  52 engine = create_engine(connection_string)
  53 ```
  54
  55 ```python
  56 !ls uk-deaths-data/publishedweek*2021*xlsx
  57 ```
  58
  59 ```python Collapsed="false"
  60 england_wales_filename = 'uk-deaths-data/publishedweek122021.xlsx'
  61 ```
  62
  63 ```python Collapsed="false"
  64 scotland_filename = 'uk-deaths-data/Scottish Government COVID-19 data (08 April 2021).xlsx'
  65 ```
  66
  67 ```python
  68 n_ireland_filename = 'uk-deaths-data/Weekly_Deaths - w e 2nd April 2021.XLSX'
  69 ```
  70
  71 ```python Collapsed="false"
  72 # eng_xls = pd.read_excel(england_wales_filename,
  73 #                         sheet_name="Weekly figures 2021",
  74 #                         skiprows=[0, 1, 2, 3],
  75 #                         skipfooter=11,
  76 #                         header=0,
  77 #                         index_col=[1]
  78 #                        ).iloc[:99].T
  79 # eng_xls
  80 ```
  81
  82 ```python Collapsed="false"
  83 eng_xls = pd.read_excel(england_wales_filename,
  84                         sheet_name="Weekly figures 2021",
  85                         skiprows=[0, 1, 2, 3],
  86                         skipfooter=10,
  87                         header=0,
  88                         index_col=[1]
  89                        ).T
  90 eng_xls
  91 ```
  92
  93 ```python Collapsed="false"
  94 eng_xls_columns = list(eng_xls.columns)
  95
  96 for i, c in enumerate(eng_xls_columns):
  97 #     print(i, c, type(c), isinstance(c, float))
  98     if isinstance(c, float) and np.isnan(c):
  99         if eng_xls.iloc[0].iloc[i] is not pd.NaT:
 100             eng_xls_columns[i] = eng_xls.iloc[0].iloc[i]
 101
 102 # np.isnan(eng_xls_columns[0])
 103 # eng_xls_columns
 104
 105 eng_xls.columns = eng_xls_columns
 106 # eng_xls.columns
 107 ```
 108
 109 ```python
 110 eng_xls
 111 ```
 112
 113 ```python
 114 rd = eng_xls.iloc[1:][['Week ended', 'Wales']].reset_index(level=0).rename(
 115     columns={'Week ended': 'date_up_to', 'Wales': 'deaths',
 116             'index': 'week'}
 117     )
 118 rd['year'] = 2021
 119 rd['nation'] = 'Wales'
 120 rd.dropna(inplace=True)
 121 rd.tail()
 122 ```
 123
 124 ```python
 125 query_string = '''
 126 delete from all_causes_deaths
 127 where nation = 'Wales'
 128   and year = 2021;
 129 '''
 130 with engine.connect() as connection:
 131     connection.execute(query_string)
 132 ```
 133
 134 ```python
 135 rd.to_sql(
 136     'all_causes_deaths',
 137     engine,
 138     if_exists='append',
 139     index=False)
 140 ```
 141
 142 ```python
 143 %sql select * from all_causes_deaths where year = 2021 order by date_up_to desc limit 10
 144 ```
 145
 146 ```python
 147 rd = eng_xls.iloc[1:][['Week ended', 'Total deaths, all ages (2021)', 'Wales']].reset_index(level=0).rename(
 148     columns={'Week ended': 'date_up_to',
 149              'Total deaths, all ages (2021)': 'ew_deaths',
 150              'Wales': 'w_deaths',
 151             'index': 'week'}
 152     )
 153 rd['year'] = 2021
 154 rd['nation'] = 'England'
 155 rd['deaths'] = rd['ew_deaths'] - rd['w_deaths']
 156 rd.drop(labels=['ew_deaths', 'w_deaths'], axis='columns', inplace=True)
 157 rd.dropna(inplace=True)
 158 rd.tail()
 159 ```
 160
 161 ```python
 162 query_string = '''
 163 delete from all_causes_deaths
 164 where nation = 'England'
 165   and year = 2021;
 166 '''
 167 with engine.connect() as connection:
 168     connection.execute(query_string)
 169 ```
 170
 171 ```python
 172 rd.to_sql(
 173     'all_causes_deaths',
 174     engine,
 175     if_exists='append',
 176     index=False)
 177 ```
 178
 179 ```python
 180 %sql select nation, sum(deaths), count(*) from all_causes_deaths where year = 2021 group by nation
 181 ```
 182
 183 ```python
 184 ni_xls = pd.read_excel(n_ireland_filename,
 185                         sheet_name='Table 1',
 186                         skiprows=[0, 1, 2],
 187                         header=0,
 188                       ).rename(
 189     columns={'Week Ending (Friday)': 'date_up_to', 'Total Number of Deaths Registered in Week (2021P)': 'deaths',
 190             'Registration Week': 'week'})
 191 rd = ni_xls[ni_xls['deaths'].notna()][['week', 'date_up_to', 'deaths']]
 192 rd['year'] = 2021
 193 rd['nation'] = 'Northern Ireland'
 194 rd.tail()
 195 ```
 196
 197 ```python
 198 query_string = '''
 199 delete from all_causes_deaths
 200 where nation = 'Northern Ireland'
 201   and year = 2021;
 202 '''
 203 with engine.connect() as connection:
 204     connection.execute(query_string)
 205 ```
 206
 207 ```python
 208 rd.to_sql(
 209     'all_causes_deaths',
 210     engine,
 211     if_exists='append',
 212     index=False)
 213 ```
 214
 215 ```python
 216 %sql select nation, count(*) from all_causes_deaths group by nation
 217 ```
 218
 219 ```python
 220 # sco_xls = pd.read_excel(scotland_filename,
 221 #                         sheet_name="2.2_excess",
 222 #                         skiprows=[0, 1, 2, 3],
 223 #                         skipfooter=3,
 224 #                         header=0,
 225 #                         index_col=[1]
 226 #                        ).iloc[:91].T
 227 ```
 228
 229 ```python
 230 import openpyxl
 231 ```
 232
 233 ```python
 234 wb = openpyxl.load_workbook(scotland_filename, read_only=True)#, data_only=True, keep_links=False)
 235 sheet = wb.worksheets[7]
 236 sheet
 237 ```
 238
 239 ```python
 240 sheet.reset_dimensions()
 241 ```
 242
 243 ```python
 244 scot_elems = [[value for value in row] for row in sheet.values]
 245 scot_cols = scot_elems[3]
 246 scot_dicts = [{k: v for k, v in zip(scot_cols, row)} for row in scot_elems[4:]]
 247 scot_data = pd.DataFrame(scot_dicts)
 248 rd = scot_data[scot_data.date >= '2021'].rename(
 249     columns={'week_number': 'week', 'date': 'date_up_to', 'total_deaths': 'deaths'})[['week', 'date_up_to', 'deaths']]
 250 rd['year'] = 2021
 251 rd['nation'] = 'Scotland'
 252 rd
 253 ```
 254
 255 ```python
 256 query_string = '''
 257 delete from all_causes_deaths
 258 where nation = 'Scotland'
 259   and year = 2021;
 260 '''
 261 with engine.connect() as connection:
 262     connection.execute(query_string)
 263 ```
 264
 265 ```python
 266 rd.to_sql(
 267     'all_causes_deaths',
 268     engine,
 269     if_exists='append',
 270     index=False)
 271 ```
 272
 273 ```python
 274 %sql select nation, sum(deaths), count(*) from all_causes_deaths where year = 2021 group by nation
 275 ```
 276
 277 ```python
 278
 279 ```
 280
 281 # Create graphs
 282
 283 ```python
 284 qstr = '''select week, year, deaths
 285 from all_causes_deaths
 286 where nation = 'England' '''
 287 deaths_e = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
 288 deaths_e.head()
 289 ```
 290
 291 ```python
 292 qstr = '''select week, year, deaths
 293 from all_causes_deaths
 294 where nation = 'Wales' '''
 295 deaths_w = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
 296 deaths_w.head()
 297 ```
 298
 299 ```python
 300 qstr = '''select week, year, deaths
 301 from all_causes_deaths
 302 where nation = 'Scotland' '''
 303 deaths_s = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
 304 deaths_s.head()
 305 ```
 306
 307 ```python
 308 qstr = '''select week, year, deaths
 309 from all_causes_deaths
 310 where nation = 'Northern Ireland' '''
 311 deaths_i = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
 312 deaths_i.head()
 313 ```
 314
 315 ```python
 316 qstr = '''select week, avg(deaths) as prev_mean
 317 from all_causes_deaths
 318 where year <= 2019 and nation='England'
 319 group by week
 320 order by week'''
 321 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
 322 deaths_prev.head()
 323 deaths_e = deaths_e.merge(deaths_prev, on='week')
 324 deaths_e.head()
 325 ```
 326
 327 ```python
 328 qstr = '''select week, avg(deaths) as prev_mean
 329 from all_causes_deaths
 330 where year <= 2019 and nation='Wales'
 331 group by week
 332 order by week'''
 333 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
 334 deaths_prev.head()
 335 deaths_w = deaths_w.merge(deaths_prev, on='week')
 336 deaths_w.head()
 337 ```
 338
 339 ```python
 340 qstr = '''select week, avg(deaths) as prev_mean
 341 from all_causes_deaths
 342 where year <= 2019 and nation='Scotland'
 343 group by week
 344 order by week'''
 345 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
 346 deaths_prev.head()
 347 deaths_s = deaths_s.merge(deaths_prev, on='week')
 348 deaths_s.head()
 349 ```
 350
 351 ```python
 352 qstr = '''select week, avg(deaths) as prev_mean
 353 from all_causes_deaths
 354 where year <= 2019 and nation='Northern Ireland'
 355 group by week
 356 order by week'''
 357 deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
 358 deaths_prev.head()
 359 deaths_i = deaths_i.merge(deaths_prev, on='week')
 360 deaths_i.head()
 361 ```
 362
 363 ```python Collapsed="false"
 364 deaths = deaths_e + deaths_w + deaths_i + deaths_s
 365 deaths.head()
 366 ```
 367
 368 ```python Collapsed="false"
 369 deaths[[2021, 2020, 2019, 2018, 2017, 2016, 2015]].plot(figsize=(14, 8))
 370 ```
 371
 372 ```python Collapsed="false"
 373 deaths[[2021, 2020, 'prev_mean']].plot(figsize=(10, 8))
 374 ```
 375
 376 ```python Collapsed="false"
 377 deaths_i.plot()
 378 ```
 379
 380 ```python
 381 deaths[2020].sum() - deaths.prev_mean.sum()
 382 ```
 383
 384 ```python
 385 def _closeline(line):
 386     x, y = line.get_data()
 387     x = np.concatenate((x, [x[0]]))
 388     y = np.concatenate((y, [y[0]]))
 389     line.set_data(x, y)
 390
 391 def _closeline_connect(lines):
 392     for line0, line1 in zip(lines, lines[1:]):
 393         x0, y0 = line0.get_data()
 394         x1, y1 = line1.get_data()
 395
 396         x0 = np.concatenate((x0, [x1[0]]))
 397         y0 = np.concatenate((y0, [y1[0]]))
 398         line0.set_data(x0, y0)
 399 ```
 400
 401 ```python Collapsed="false"
 402 # Radar plot code taken from example at https://stackoverflow.com/questions/42878485/getting-matplotlib-radar-plot-with-pandas#
 403
 404 def create_and_save_radar_plot(dataset, title_string, filename_suffix):
 405
 406     fig = plt.figure(figsize=(10, 10))
 407     ax = fig.add_subplot(111, projection="polar")
 408
 409     theta = np.roll(
 410         np.flip(
 411             np.arange(len(dataset))/float(len(dataset))*2.*np.pi),
 412         14)
 413     l15, = ax.plot(theta, dataset[2015], color="#e47d7d", label="2015") # 0
 414     l16, = ax.plot(theta, dataset[2016], color="#afc169", label="2016") # 72 , d0e47d
 415     l17, = ax.plot(theta, dataset[2017], color="#7de4a6", label="2017") # 144
 416     l18, = ax.plot(theta, dataset[2018], color="#7da6e4", label="2018") # 216
 417     l19, = ax.plot(theta, dataset[2019], color="#d07de4", label="2019") # 288
 418
 419     lmean, = ax.plot(theta, dataset['prev_mean'], color="black", linestyle='dashed', label="mean, 15–19")
 420
 421     l20, = ax.plot(theta, dataset[2020], color="#bb0000", label="2020")
 422     l21, = ax.plot(theta, dataset[2021], color="#ff0000", label="2021")
 423
 424     # deaths_headlines.total_2019.plot(ax=ax)
 425
 426     _closeline(lmean)
 427     _closeline_connect([l15, l16, l17, l18, l19, l20, l21])
 428
 429     ax.set_xticks(theta)
 430     ax.set_xticklabels(dataset.index)
 431     plt.legend()
 432     plt.title(f"Deaths by week over years, {title_string}")
 433     plt.savefig(f'deaths_radar_2021{filename_suffix}.png')
 434     plt.show()
 435 ```
 436
 437 ```python
 438 create_and_save_radar_plot(deaths, 'all UK', '')
 439 ```
 440
 441 ```python Collapsed="false"
 442 # # Radar plot code taken from example at https://stackoverflow.com/questions/42878485/getting-matplotlib-radar-plot-with-pandas#
 443
 444 # dhna = deaths# .dropna()
 445
 446 # fig = plt.figure(figsize=(10, 10))
 447 # ax = fig.add_subplot(111, projection="polar")
 448
 449 # theta = np.roll(
 450 #     np.flip(
 451 #         np.arange(len(dhna))/float(len(dhna))*2.*np.pi),
 452 #     14)
 453 # # l15, = ax.plot(theta, deaths_headlines['total_2015'], color="#b56363", label="2015") # 0
 454 # # l16, = ax.plot(theta, deaths_headlines['total_2016'], color="#a4b563", label="2016") # 72
 455 # # l17, = ax.plot(theta, deaths_headlines['total_2017'], color="#63b584", label="2017") # 144
 456 # # l18, = ax.plot(theta, deaths_headlines['total_2018'], color="#6384b5", label="2018") # 216
 457 # # l19, = ax.plot(theta, deaths_headlines['total_2019'], color="#a4635b", label="2019") # 288
 458 # l15, = ax.plot(theta, dhna[2015], color="#e47d7d", label="2015") # 0
 459 # l16, = ax.plot(theta, dhna[2016], color="#afc169", label="2016") # 72 , d0e47d
 460 # l17, = ax.plot(theta, dhna[2017], color="#7de4a6", label="2017") # 144
 461 # l18, = ax.plot(theta, dhna[2018], color="#7da6e4", label="2018") # 216
 462 # l19, = ax.plot(theta, dhna[2019], color="#d07de4", label="2019") # 288
 463
 464 # lmean, = ax.plot(theta, dhna['prev_mean'], color="black", linestyle='dashed', label="mean, 15–19")
 465
 466 # l20, = ax.plot(theta, dhna[2020], color="#bb0000", label="2020")
 467 # l21, = ax.plot(theta, dhna[2021], color="#ff0000", label="2021")
 468
 469 # # deaths_headlines.total_2019.plot(ax=ax)
 470
 471 # _closeline(lmean)
 472 # _closeline_connect([l15, l16, l17, l18, l19, l20, l21])
 473
 474 # ax.set_xticks(theta)
 475 # ax.set_xticklabels(dhna.index)
 476 # plt.legend()
 477 # plt.title("Deaths by week over years, all UK")
 478 # plt.savefig('deaths-radar-2021.png')
 479 # plt.show()
 480 ```
 481
 482 <!-- #region Collapsed="false" -->
 483 # Plots for UK nations
 484 <!-- #endregion -->
 485
 486 ```python
 487 create_and_save_radar_plot(deaths_e, 'England', '_england')
 488 ```
 489
 490 ```python
 491 create_and_save_radar_plot(deaths_w, 'Wales', '_wales')
 492 ```
 493
 494 ```python
 495 create_and_save_radar_plot(deaths_s, 'Scotland', '_scotland')
 496 ```
 497
 498 ```python
 499 create_and_save_radar_plot(deaths_i, 'Northern Ireland', '_northern_ireland')
 500 ```
 501
 502 ```python Collapsed="false"
 503
 504 ```