Updated for imported data format
[covid19.git] / international_comparison-large-countries.md
1 ---
2 jupyter:
3 jupytext:
4 formats: ipynb,md
5 text_representation:
6 extension: .md
7 format_name: markdown
8 format_version: '1.2'
9 jupytext_version: 1.9.1
10 kernelspec:
11 display_name: Python 3
12 language: python
13 name: python3
14 ---
15
16 <!-- #region Collapsed="false" -->
17 Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
18 <!-- #endregion -->
19
20 ```python Collapsed="false"
21 import itertools
22 import collections
23 import json
24 import pandas as pd
25 import numpy as np
26 from scipy.stats import gmean
27 import datetime
28
29 from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
30 from sqlalchemy import create_engine
31
32 import matplotlib as mpl
33 import matplotlib.pyplot as plt
34 %matplotlib inline
35 %load_ext sql
36 ```
37
38 ```python Collapsed="false"
39 connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
40 ```
41
42 ```python Collapsed="false"
43 %sql $connection_string
44 ```
45
46 ```python
47 engine = create_engine(connection_string)
48 ```
49
50 ```python Collapsed="false"
51 DEATH_COUNT_THRESHOLD = 10
52 COUNTRIES_CORE = tuple(sorted('IT DE UK ES IE FR BE'.split()))
53 COUNTRIES_NORDIC = tuple('SE NO DK FI UK'.split())
54 COUNTRIES_FRIENDS = tuple('IT UK ES BE SI MX'.split())
55 # COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split()
56
57 COUNTRIES_AMERICAS = ('AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA
58 'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
59 'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
60 'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE')
61 COUNTRIES_OF_INTEREST = tuple(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
62 COUNTRIES_ALL = tuple(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))
63 ```
64
65 ```python Collapsed="false"
66 # res = %sql select report_date, geo_id, deaths_weekly, culm_deaths from weekly_cases where geo_id in $COUNTRIES_CORE order by report_date, geo_id
67 ```
68
69 ```python Collapsed="false"
70 # country_data = res.DataFrame()
71 # country_data['report_date'] = country_data.report_date.astype('datetime64[ns]')
72 # country_data.set_index('report_date', inplace=True)
73 # country_data.tail(10)
74 ```
75
76 ```python
77 %sql select geo_id, country_name, population_2019 from countries where population_2019 > 30000000
78 ```
79
80 ```python
81 query_string = f'''select report_date, weekly_cases.geo_id, country_name,
82 10000000.0 * deaths_weekly / population_2019 as deaths_weekly_pc,
83 10000000.0 * culm_deaths / population_2019 as culm_deaths_pc
84 from weekly_cases, countries
85 where weekly_cases.geo_id = countries.geo_id
86 and countries.population_2019 > 20000000
87 order by report_date, geo_id'''
88
89 country_data = pd.read_sql_query(query_string,
90 engine,
91 index_col = 'report_date',
92 parse_dates = ['report_date']
93 )
94 ```
95
96 ```python
97 country_data.tail()
98 ```
99
100 ```python
101 latest_date = country_data.index.max()
102 latest_date
103 ```
104
105 ```python
106 # highest_deaths_geo_ids = country_data.loc[latest_date, :].nlargest(10, 'culm_deaths_pc')['geo_id'].values
107 highest_deaths_geo_ids = (
108 country_data.loc[latest_date, :]
109 .nlargest(10, 'culm_deaths_pc')
110 ['country_name']
111 .values
112 )
113
114 highest_deaths_geo_ids
115 ```
116
117 ```python
118 high_deaths_country_data = (
119 country_data[
120 country_data.country_name.isin(highest_deaths_geo_ids)]
121 )
122 high_deaths_country_data.tail()
123 ```
124
125 ```python Collapsed="false"
126 deaths_culm = high_deaths_country_data.pivot(columns='country_name',
127 values='culm_deaths_pc')
128 deaths_culm.tail()
129 ```
130
131 ```python Collapsed="false"
132 deaths_culm.plot()
133 ```
134
135 ```python Collapsed="false"
136 ax = deaths_culm.loc['2020-03-15':].plot(figsize=(10, 6), title="Total deaths, linear")
137 ax.set_xlabel(f"Date")
138 for c in highest_deaths_geo_ids:
139 lvi = deaths_culm[c].last_valid_index()
140 ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_culm[c][lvi], s = f"{c}: {deaths_culm[c][lvi]:.0f}")
141 # plt.savefig('covid_deaths_total_linear.png')
142 ```
143
144 ```python
145 deaths_culm.loc[latest_date].sort_values()
146 ```
147
148 ```python Collapsed="false"
149 deaths_weekly = high_deaths_country_data.pivot(columns='country_name', values='deaths_weekly_pc')
150 deaths_weekly.tail()
151 ```
152
153 ```python Collapsed="false"
154 deaths_weekly.plot()
155 ```
156
157 ```python Collapsed="false"
158 ax = deaths_weekly.loc['2020-03-01':].plot(figsize=(10, 6), title="Deaths per week")
159 ax.set_xlabel('Date')
160 for c in highest_deaths_geo_ids:
161 lvi = deaths_weekly[c].last_valid_index()
162 ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_weekly[c][lvi], s = c)
163 # plt.savefig('covid_deaths_per_week.png')
164 ```
165
166 ```python Collapsed="false"
167 ax = deaths_weekly.iloc[-6:].plot(figsize=(10, 6), title="Deaths per week")#, ylim=(-10, 100))
168 ax.set_xlabel("Date")
169
170 text_x_pos = deaths_weekly.last_valid_index() + pd.Timedelta(days=0.5)
171
172 for c in highest_deaths_geo_ids:
173 lvi = deaths_weekly[c].last_valid_index()
174 # if c != 'ES':
175 ax.text(x = text_x_pos, y = deaths_weekly[c][lvi], s = f"{c}: {deaths_weekly[c][lvi]:.0f}")
176 plt.savefig('deaths_by_date_last_6_weeks.png')
177 ```
178
179 ```python
180 query_string = f'''select report_date, geo_id,
181 (cast(deaths_weekly as float) / population_2019) deaths_weekly_pc,
182 (cast(culm_deaths as float) / population_2019) as culm_deaths_pc
183 from weekly_cases join countries using (geo_id)
184 where geo_id in {COUNTRIES_CORE}
185 order by report_date, geo_id'''
186
187 deaths_pc_data = pd.read_sql_query(query_string,
188 engine,
189 index_col = 'report_date',
190 parse_dates = ['report_date']
191 )
192 deaths_pc_data.tail()
193 ```
194
195 ```python Collapsed="false"
196 deaths_pc_culm = deaths_pc_data.pivot(columns='geo_id', values='culm_deaths_pc')
197 deaths_pc_culm.tail()
198 ```
199
200 ```python
201 ax = deaths_pc_culm.loc['2020-03-15':].plot(figsize=(10, 6), title="Deaths per million, linear")
202 ax.set_xlabel(f"Date")
203 for c in COUNTRIES_CORE:
204 lvi = deaths_pc_culm[c].last_valid_index()
205 ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_pc_culm[c][lvi], s = f"{c}: {deaths_pc_culm[c][lvi]:.0f}")
206 # plt.savefig('covid_deaths_total_linear.png')
207 ```
208
209 ```python Collapsed="false"
210 deaths_pc = deaths_pc_data.pivot(columns='geo_id', values='deaths_weekly_pc')
211 deaths_pc.tail()
212 ```
213
214 ```python
215 ax = deaths_pc.loc['2020-03-15':].plot(figsize=(10, 6), title="Deaths per million, linear")
216 ax.set_xlabel(f"Date")
217 for c in COUNTRIES_CORE:
218 lvi = deaths_pc[c].last_valid_index()
219 ax.text(x = lvi + pd.Timedelta(days=1), y = deaths_pc[c][lvi], s = f"{c}: {deaths_pc[c][lvi]:.0f}")
220 # plt.savefig('covid_deaths_total_linear.png')
221 ```
222
223 ```python
224 ax = deaths_pc.iloc[-6:].plot(figsize=(10, 6), title="Deaths per million, linear")
225 ax.set_xlabel(f"Date")
226 for c in COUNTRIES_CORE:
227 lvi = deaths_pc[c].last_valid_index()
228 ax.text(x = lvi, y = deaths_pc[c][lvi], s = f"{c}: {deaths_pc[c][lvi]*10**6:.0f}")
229 # plt.savefig('covid_deaths_total_linear.png')
230 ```
231
232 ```python
233
234 ```