Updated for imported data format
[covid19.git] / international_comparison-large-countries.md
diff --git a/international_comparison-large-countries.md b/international_comparison-large-countries.md
new file mode 100644 (file)
index 0000000..9a750df
--- /dev/null
@@ -0,0 +1,234 @@
+---
+jupyter:
+  jupytext:
+    formats: ipynb,md
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.2'
+      jupytext_version: 1.9.1
+  kernelspec:
+    display_name: Python 3
+    language: python
+    name: python3
+---
+
+<!-- #region Collapsed="false" -->
+Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
+<!-- #endregion -->
+
+```python Collapsed="false"
+import itertools
+import collections
+import json
+import pandas as pd
+import numpy as np
+from scipy.stats import gmean
+import datetime
+
+from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
+from sqlalchemy import create_engine
+
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+%matplotlib inline
+%load_ext sql
+```
+
+```python Collapsed="false"
+connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
+```
+
+```python Collapsed="false"
+%sql $connection_string
+```
+
+```python
+engine = create_engine(connection_string)
+```
+
+```python Collapsed="false"
+DEATH_COUNT_THRESHOLD = 10
+COUNTRIES_CORE = tuple(sorted('IT DE UK ES IE FR BE'.split()))
+COUNTRIES_NORDIC = tuple('SE NO DK FI UK'.split())
+COUNTRIES_FRIENDS = tuple('IT UK ES BE SI MX'.split())
+# COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split()
+
+COUNTRIES_AMERICAS = ('AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA
+       'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
+       'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
+       'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE')
+COUNTRIES_OF_INTEREST = tuple(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
+COUNTRIES_ALL = tuple(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))
+```
+
+```python Collapsed="false"
+# res = %sql select report_date, geo_id, deaths_weekly, culm_deaths from weekly_cases where geo_id in $COUNTRIES_CORE order by report_date, geo_id
+```
+
+```python Collapsed="false"
+# country_data = res.DataFrame()
+# country_data['report_date'] = country_data.report_date.astype('datetime64[ns]')
+# country_data.set_index('report_date', inplace=True)
+# country_data.tail(10)
+```
+
+```python
+%sql select geo_id, country_name, population_2019 from countries where population_2019 > 30000000
+```
+
+```python
+query_string = f'''select report_date, weekly_cases.geo_id, country_name,
+  10000000.0 * deaths_weekly / population_2019 as deaths_weekly_pc, 
+  10000000.0 * culm_deaths / population_2019 as culm_deaths_pc
+from weekly_cases, countries 
+where weekly_cases.geo_id = countries.geo_id
+and countries.population_2019 > 20000000
+order by report_date, geo_id'''
+
+country_data = pd.read_sql_query(query_string,
+                  engine,
+                  index_col = 'report_date',
+                  parse_dates = ['report_date']
+                 )
+```
+
+```python
+country_data.tail()
+```
+
+```python
+latest_date = country_data.index.max()
+latest_date
+```
+
+```python
+# highest_deaths_geo_ids = country_data.loc[latest_date, :].nlargest(10, 'culm_deaths_pc')['geo_id'].values
+highest_deaths_geo_ids = (
+  country_data.loc[latest_date, :]
+  .nlargest(10, 'culm_deaths_pc')
+  ['country_name']
+  .values
+)
+
+highest_deaths_geo_ids
+```
+
+```python
+high_deaths_country_data = (
+  country_data[
+    country_data.country_name.isin(highest_deaths_geo_ids)]
+)
+high_deaths_country_data.tail()
+```
+
+```python Collapsed="false"
+deaths_culm = high_deaths_country_data.pivot(columns='country_name', 
+                                             values='culm_deaths_pc')
+deaths_culm.tail()
+```
+
+```python Collapsed="false"
+deaths_culm.plot()
+```
+
+```python Collapsed="false"
+ax = deaths_culm.loc['2020-03-15':].plot(figsize=(10, 6), title="Total deaths, linear")
+ax.set_xlabel(f"Date")
+for c in highest_deaths_geo_ids:
+    lvi = deaths_culm[c].last_valid_index()
+    ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_culm[c][lvi], s = f"{c}: {deaths_culm[c][lvi]:.0f}")
+# plt.savefig('covid_deaths_total_linear.png')    
+```
+
+```python
+deaths_culm.loc[latest_date].sort_values()
+```
+
+```python Collapsed="false"
+deaths_weekly = high_deaths_country_data.pivot(columns='country_name', values='deaths_weekly_pc')
+deaths_weekly.tail()
+```
+
+```python Collapsed="false"
+deaths_weekly.plot()
+```
+
+```python Collapsed="false"
+ax = deaths_weekly.loc['2020-03-01':].plot(figsize=(10, 6), title="Deaths per week")
+ax.set_xlabel('Date')
+for c in highest_deaths_geo_ids:
+    lvi = deaths_weekly[c].last_valid_index()
+    ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_weekly[c][lvi], s = c)
+# plt.savefig('covid_deaths_per_week.png') 
+```
+
+```python Collapsed="false"
+ax = deaths_weekly.iloc[-6:].plot(figsize=(10, 6), title="Deaths per week")#, ylim=(-10, 100))
+ax.set_xlabel("Date")
+
+text_x_pos = deaths_weekly.last_valid_index() + pd.Timedelta(days=0.5)
+
+for c in highest_deaths_geo_ids:
+    lvi = deaths_weekly[c].last_valid_index()
+#     if c != 'ES':
+    ax.text(x = text_x_pos, y = deaths_weekly[c][lvi], s = f"{c}: {deaths_weekly[c][lvi]:.0f}")
+plt.savefig('deaths_by_date_last_6_weeks.png') 
+```
+
+```python
+query_string = f'''select report_date, geo_id, 
+(cast(deaths_weekly as float) / population_2019) deaths_weekly_pc,
+(cast(culm_deaths as float) / population_2019) as culm_deaths_pc
+from weekly_cases join countries using (geo_id)
+where  geo_id in {COUNTRIES_CORE} 
+order by report_date, geo_id'''
+
+deaths_pc_data = pd.read_sql_query(query_string,
+                  engine,
+                  index_col = 'report_date',
+                  parse_dates = ['report_date']
+                 )
+deaths_pc_data.tail()
+```
+
+```python Collapsed="false"
+deaths_pc_culm = deaths_pc_data.pivot(columns='geo_id', values='culm_deaths_pc')
+deaths_pc_culm.tail()
+```
+
+```python
+ax = deaths_pc_culm.loc['2020-03-15':].plot(figsize=(10, 6), title="Deaths per million, linear")
+ax.set_xlabel(f"Date")
+for c in COUNTRIES_CORE:
+    lvi = deaths_pc_culm[c].last_valid_index()
+    ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_pc_culm[c][lvi], s = f"{c}: {deaths_pc_culm[c][lvi]:.0f}")
+# plt.savefig('covid_deaths_total_linear.png')    
+```
+
+```python Collapsed="false"
+deaths_pc = deaths_pc_data.pivot(columns='geo_id', values='deaths_weekly_pc')
+deaths_pc.tail()
+```
+
+```python
+ax = deaths_pc.loc['2020-03-15':].plot(figsize=(10, 6), title="Deaths per million, linear")
+ax.set_xlabel(f"Date")
+for c in COUNTRIES_CORE:
+    lvi = deaths_pc[c].last_valid_index()
+    ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_pc[c][lvi], s = f"{c}: {deaths_pc[c][lvi]:.0f}")
+# plt.savefig('covid_deaths_total_linear.png')    
+```
+
+```python
+ax = deaths_pc.iloc[-6:].plot(figsize=(10, 6), title="Deaths per million, linear")
+ax.set_xlabel(f"Date")
+for c in COUNTRIES_CORE:
+    lvi = deaths_pc[c].last_valid_index()
+    ax.text(x = lvi, y = deaths_pc[c][lvi], s = f"{c}: {deaths_pc[c][lvi]*10**6:.0f}")
+# plt.savefig('covid_deaths_total_linear.png')    
+```
+
+```python
+
+```