Now using py files, for automation
[covid19.git] / uk_deaths_import.md
diff --git a/uk_deaths_import.md b/uk_deaths_import.md
new file mode 100644 (file)
index 0000000..e775661
--- /dev/null
@@ -0,0 +1,489 @@
+---
+jupyter:
+  jupytext:
+    formats: ipynb,md
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.2'
+      jupytext_version: 1.9.1
+  kernelspec:
+    display_name: Python 3
+    language: python
+    name: python3
+---
+
+<!-- #region Collapsed="false" -->
+Data from:
+
+* [Office of National Statistics](https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales) (Endland and Wales) Weeks start on a Saturday.
+* [Northern Ireland Statistics and Research Agency](https://www.nisra.gov.uk/publications/weekly-deaths) (Northern Ireland). Weeks start on a Saturday. Note that the week numbers don't match the England and Wales data.
+* [National Records of Scotland](https://www.nrscotland.gov.uk/statistics-and-data/statistics/statistics-by-theme/vital-events/general-publications/weekly-and-monthly-data-on-births-and-deaths/weekly-data-on-births-and-deaths) (Scotland). Note that Scotland uses ISO8601 week numbers, which start on a Monday.
+
+<!-- #endregion -->
+
+```python Collapsed="false"
+import itertools
+import collections
+import json
+import pandas as pd
+import numpy as np
+from scipy.stats import gmean
+import datetime
+
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+%matplotlib inline
+
+from sqlalchemy.types import Integer, Text, String, DateTime, Float
+from sqlalchemy import create_engine
+%load_ext sql
+```
+
+```python Collapsed="false"
+connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
+```
+
+```python Collapsed="false"
+%sql $connection_string
+```
+
+```python Collapsed="false"
+engine = create_engine(connection_string)
+```
+
+```python Collapsed="false"
+england_wales_filename = 'uk-deaths-data/copyofpublishedweek042021.xlsx'
+```
+
+```python Collapsed="false"
+scotland_filename = 'uk-deaths-data/Scottish Government COVID-19 data (10 February 2021).xlsx'
+```
+
+```python
+n_ireland_filename = 'uk-deaths-data/Weekly_Deaths_0.xlsx'
+```
+
+```python Collapsed="false"
+eng_xls = pd.read_excel(england_wales_filename, 
+                        sheet_name="Weekly figures 2021",
+                        skiprows=[0, 1, 2, 3],
+                        skipfooter=11,
+                        header=0,
+                        index_col=[1]
+                       ).iloc[:91].T
+eng_xls
+```
+
+```python Collapsed="false"
+eng_xls_columns = list(eng_xls.columns)
+
+for i, c in enumerate(eng_xls_columns):
+#     print(i, c, type(c), isinstance(c, float))
+    if isinstance(c, float) and np.isnan(c):
+        if eng_xls.iloc[0].iloc[i] is not pd.NaT:
+            eng_xls_columns[i] = eng_xls.iloc[0].iloc[i]
+
+# np.isnan(eng_xls_columns[0])
+# eng_xls_columns
+
+eng_xls.columns = eng_xls_columns
+# eng_xls.columns
+```
+
+```python
+eng_xls
+```
+
+```python
+rd = eng_xls.iloc[1:][['Week ended', 'Wales']].reset_index(level=0).rename(
+    columns={'Week ended': 'date_up_to', 'Wales': 'deaths',
+            'index': 'week'}
+    )
+rd['year'] = 2021
+rd['nation'] = 'Wales'
+rd.dropna(inplace=True)
+rd.head()
+```
+
+```python
+query_string = '''
+delete from all_causes_deaths
+where nation = 'Wales'
+  and year = 2021;
+'''
+with engine.connect() as connection:
+    connection.execute(query_string)
+```
+
+```python
+rd.to_sql(
+    'all_causes_deaths',
+    engine,
+    if_exists='append',
+    index=False)
+```
+
+```python
+%sql select * from all_causes_deaths where year = 2021 limit 10
+```
+
+```python
+rd = eng_xls.iloc[1:][['Week ended', 'Total deaths, all ages (2021)', 'Wales']].reset_index(level=0).rename(
+    columns={'Week ended': 'date_up_to', 
+             'Total deaths, all ages (2021)': 'ew_deaths',
+             'Wales': 'w_deaths',
+            'index': 'week'}
+    )
+rd['year'] = 2021
+rd['nation'] = 'England'
+rd['deaths'] = rd['ew_deaths'] - rd['w_deaths']
+rd.drop(labels=['ew_deaths', 'w_deaths'], axis='columns', inplace=True)
+rd.dropna(inplace=True)
+rd.tail()
+```
+
+```python
+query_string = '''
+delete from all_causes_deaths
+where nation = 'England'
+  and year = 2021;
+'''
+with engine.connect() as connection:
+    connection.execute(query_string)
+```
+
+```python
+rd.to_sql(
+    'all_causes_deaths',
+    engine,
+    if_exists='append',
+    index=False)
+```
+
+```python
+%sql select nation, sum(deaths), count(*) from all_causes_deaths where year = 2021 group by nation
+```
+
+```python
+ni_xls = pd.read_excel(n_ireland_filename, 
+                        sheet_name='Table 1',
+                        skiprows=[0, 1, 2, 3],
+                        header=0,
+                      ).rename(
+    columns={'Week Ending (Friday)': 'date_up_to', 'Total Number of Deaths Registered in Week (2021P)': 'deaths',
+            'Registration Week': 'week'})
+rd = ni_xls[ni_xls['deaths'].notna()][['week', 'date_up_to', 'deaths']]
+rd['year'] = 2021
+rd['nation'] = 'Northern Ireland'
+rd
+```
+
+```python
+query_string = '''
+delete from all_causes_deaths
+where nation = 'Northern Ireland'
+  and year = 2021;
+'''
+with engine.connect() as connection:
+    connection.execute(query_string)
+```
+
+```python
+rd.to_sql(
+    'all_causes_deaths',
+    engine,
+    if_exists='append',
+    index=False)
+```
+
+```python
+%sql select nation from all_causes_deaths group by nation
+```
+
+```python
+# sco_xls = pd.read_excel(scotland_filename, 
+#                         sheet_name="2.2_excess",
+#                         skiprows=[0, 1, 2, 3],
+#                         skipfooter=3,
+#                         header=0,
+#                         index_col=[1]
+#                        ).iloc[:91].T
+```
+
+```python
+import openpyxl
+```
+
+```python
+wb = openpyxl.load_workbook(scotland_filename, read_only=True)#, data_only=True, keep_links=False)
+sheet = wb.worksheets[7]
+sheet
+```
+
+```python
+sheet.reset_dimensions()
+```
+
+```python
+scot_elems = [[value for value in row] for row in sheet.values]
+scot_cols = scot_elems[3]
+scot_dicts = [{k: v for k, v in zip(scot_cols, row)} for row in scot_elems[4:]]
+scot_data = pd.DataFrame(scot_dicts)
+rd = scot_data[scot_data.date >= '2021'].rename(
+    columns={'week_number': 'week', 'date': 'date_up_to', 'total_deaths': 'deaths'})[['week', 'date_up_to', 'deaths']]
+rd['year'] = 2021
+rd['nation'] = 'Scotland'
+rd
+```
+
+```python
+query_string = '''
+delete from all_causes_deaths
+where nation = 'Scotland'
+  and year = 2021;
+'''
+with engine.connect() as connection:
+    connection.execute(query_string)
+```
+
+```python
+rd.to_sql(
+    'all_causes_deaths',
+    engine,
+    if_exists='append',
+    index=False)
+```
+
+```python
+%sql select nation, sum(deaths), count(*) from all_causes_deaths where year = 2021 group by nation
+```
+
+```python
+
+```
+
+# Create graphs
+
+```python
+qstr = '''select week, year, deaths
+from all_causes_deaths
+where nation = 'England' '''
+deaths_e = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
+deaths_e.head()
+```
+
+```python
+qstr = '''select week, year, deaths
+from all_causes_deaths
+where nation = 'Wales' '''
+deaths_w = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
+deaths_w.head()
+```
+
+```python
+qstr = '''select week, year, deaths
+from all_causes_deaths
+where nation = 'Scotland' '''
+deaths_s = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
+deaths_s.head()
+```
+
+```python
+qstr = '''select week, year, deaths
+from all_causes_deaths
+where nation = 'Northern Ireland' '''
+deaths_i = pd.read_sql_query(qstr, engine).pivot(index='week', columns='year', values='deaths')
+deaths_i.head()
+```
+
+```python
+qstr = '''select week, avg(deaths) as prev_mean
+from all_causes_deaths
+where year <= 2019 and nation='England' 
+group by week
+order by week'''
+deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
+deaths_prev.head()
+deaths_e = deaths_e.merge(deaths_prev, on='week')
+deaths_e.head()
+```
+
+```python
+qstr = '''select week, avg(deaths) as prev_mean
+from all_causes_deaths
+where year <= 2019 and nation='Wales' 
+group by week
+order by week'''
+deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
+deaths_prev.head()
+deaths_w = deaths_w.merge(deaths_prev, on='week')
+deaths_w.head()
+```
+
+```python
+qstr = '''select week, avg(deaths) as prev_mean
+from all_causes_deaths
+where year <= 2019 and nation='Scotland' 
+group by week
+order by week'''
+deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
+deaths_prev.head()
+deaths_s = deaths_s.merge(deaths_prev, on='week')
+deaths_s.head()
+```
+
+```python
+qstr = '''select week, avg(deaths) as prev_mean
+from all_causes_deaths
+where year <= 2019 and nation='Northern Ireland' 
+group by week
+order by week'''
+deaths_prev = pd.read_sql_query(qstr, engine, index_col='week')
+deaths_prev.head()
+deaths_i = deaths_i.merge(deaths_prev, on='week')
+deaths_i.head()
+```
+
+```python Collapsed="false"
+deaths = deaths_e + deaths_w + deaths_i + deaths_s
+deaths.head()
+```
+
+```python Collapsed="false"
+deaths[[2021, 2020, 2019, 2018, 2017, 2016, 2015]].plot(figsize=(14, 8))
+```
+
+```python Collapsed="false"
+deaths[[2021, 2020, 'prev_mean']].plot(figsize=(10, 8))
+```
+
+```python Collapsed="false"
+deaths_i.plot()
+```
+
+```python
+deaths[2020].sum() - deaths.prev_mean.sum()
+```
+
+```python
+def _closeline(line):
+    x, y = line.get_data()
+    x = np.concatenate((x, [x[0]]))
+    y = np.concatenate((y, [y[0]]))
+    line.set_data(x, y)
+
+def _closeline_connect(lines):
+    for line0, line1 in zip(lines, lines[1:]):
+        x0, y0 = line0.get_data()
+        x1, y1 = line1.get_data()
+
+        x0 = np.concatenate((x0, [x1[0]]))
+        y0 = np.concatenate((y0, [y1[0]]))
+        line0.set_data(x0, y0) 
+```
+
+```python Collapsed="false"
+# Radar plot code taken from example at https://stackoverflow.com/questions/42878485/getting-matplotlib-radar-plot-with-pandas#
+
+def create_and_save_radar_plot(dataset, title_string, filename_suffix):
+
+    fig = plt.figure(figsize=(10, 10))
+    ax = fig.add_subplot(111, projection="polar")
+
+    theta = np.roll(
+        np.flip(
+            np.arange(len(dataset))/float(len(dataset))*2.*np.pi),
+        14)
+    l15, = ax.plot(theta, dataset[2015], color="#e47d7d", label="2015") # 0
+    l16, = ax.plot(theta, dataset[2016], color="#afc169", label="2016") # 72 , d0e47d
+    l17, = ax.plot(theta, dataset[2017], color="#7de4a6", label="2017") # 144
+    l18, = ax.plot(theta, dataset[2018], color="#7da6e4", label="2018") # 216
+    l19, = ax.plot(theta, dataset[2019], color="#d07de4", label="2019") # 288
+
+    lmean, = ax.plot(theta, dataset['prev_mean'], color="black", linestyle='dashed', label="mean, 15–19")
+
+    l20, = ax.plot(theta, dataset[2020], color="#bb0000", label="2020")
+    l21, = ax.plot(theta, dataset[2021], color="#ff0000", label="2021")
+
+    # deaths_headlines.total_2019.plot(ax=ax)
+
+    _closeline(lmean)
+    _closeline_connect([l15, l16, l17, l18, l19, l20, l21])
+
+    ax.set_xticks(theta)
+    ax.set_xticklabels(dataset.index)
+    plt.legend()
+    plt.title(f"Deaths by week over years, {title_string}")
+    plt.savefig(f'deaths-radar-2021{filename_suffix}.png')
+    plt.show()
+```
+
+```python
+create_and_save_radar_plot(deaths, 'all UK', '')
+```
+
+```python Collapsed="false"
+# # Radar plot code taken from example at https://stackoverflow.com/questions/42878485/getting-matplotlib-radar-plot-with-pandas#
+
+# dhna = deaths# .dropna()
+
+# fig = plt.figure(figsize=(10, 10))
+# ax = fig.add_subplot(111, projection="polar")
+
+# theta = np.roll(
+#     np.flip(
+#         np.arange(len(dhna))/float(len(dhna))*2.*np.pi),
+#     14)
+# # l15, = ax.plot(theta, deaths_headlines['total_2015'], color="#b56363", label="2015") # 0
+# # l16, = ax.plot(theta, deaths_headlines['total_2016'], color="#a4b563", label="2016") # 72
+# # l17, = ax.plot(theta, deaths_headlines['total_2017'], color="#63b584", label="2017") # 144
+# # l18, = ax.plot(theta, deaths_headlines['total_2018'], color="#6384b5", label="2018") # 216
+# # l19, = ax.plot(theta, deaths_headlines['total_2019'], color="#a4635b", label="2019") # 288
+# l15, = ax.plot(theta, dhna[2015], color="#e47d7d", label="2015") # 0
+# l16, = ax.plot(theta, dhna[2016], color="#afc169", label="2016") # 72 , d0e47d
+# l17, = ax.plot(theta, dhna[2017], color="#7de4a6", label="2017") # 144
+# l18, = ax.plot(theta, dhna[2018], color="#7da6e4", label="2018") # 216
+# l19, = ax.plot(theta, dhna[2019], color="#d07de4", label="2019") # 288
+
+# lmean, = ax.plot(theta, dhna['prev_mean'], color="black", linestyle='dashed', label="mean, 15–19")
+
+# l20, = ax.plot(theta, dhna[2020], color="#bb0000", label="2020")
+# l21, = ax.plot(theta, dhna[2021], color="#ff0000", label="2021")
+
+# # deaths_headlines.total_2019.plot(ax=ax)
+
+# _closeline(lmean)
+# _closeline_connect([l15, l16, l17, l18, l19, l20, l21])
+
+# ax.set_xticks(theta)
+# ax.set_xticklabels(dhna.index)
+# plt.legend()
+# plt.title("Deaths by week over years, all UK")
+# plt.savefig('deaths-radar-2021.png')
+# plt.show()
+```
+
+<!-- #region Collapsed="false" -->
+# Plots for UK nations
+<!-- #endregion -->
+
+```python
+create_and_save_radar_plot(deaths_e, 'England', '-england')
+```
+
+```python
+create_and_save_radar_plot(deaths_w, 'Wales', '-wales')
+```
+
+```python
+create_and_save_radar_plot(deaths_s, 'Scotland', '-scotland')
+```
+
+```python
+create_and_save_radar_plot(deaths_i, 'Northern Ireland', '-northern-ireland')
+```
+
+```python Collapsed="false"
+
+```