Changed European data source and format
[covid19.git] / data_import.py
1 #!/usr/bin/env python
2 # coding: utf-8
3 # %%
4 # Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
5
6 from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
7 from sqlalchemy import create_engine
8
9 # import itertools
10 # import collections
11 # import json
12 import pandas as pd
13 import numpy as np
14 # from scipy.stats import gmean
15 import datetime
16 import os
17 # import matplotlib as mpl
18 # import matplotlib.pyplot as plt
19 # import matplotlib.animation as ani
20
21 # # %matplotlib inline
22
23
24 # %%
25 connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
26
27 eng = create_engine(connection_string)
28 engine = eng.execution_options(isolation_level="AUTOCOMMIT")
29
30
31 # %%
32 # DEATH_COUNT_THRESHOLD = 10
33 # COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split()
34 # COUNTRIES_NORDIC = 'SE NO DK FI UK'.split()
35 # COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split()
36 # # COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split()
37
38 # COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA
39 # 'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
40 # 'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
41 # 'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE']
42 # COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
43 # COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))
44
45
46 # %%
47 ## Old data source
48 # os.system('curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv')
49
50 ## In outer shell script, as it doesn't seem to work here
51 os.system('curl https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/csv/ > covid.csv')
52
53
54 # %%
55 # First col is a date, treat geoId of NA as 'Namibia', not "NA" value
56 raw_data = pd.read_csv('covid.csv',
57 parse_dates=[7], dayfirst=True,
58 keep_default_na=False, na_values = [''],
59 # dtype = {'day': np.int64,
60 # 'month': np.int64,
61 # 'year': np.int64,
62 # 'cases': np.int64,
63 # 'deaths': np.int64,
64 # 'countriesAndTerritories': str,
65 # 'geoId': str,
66 # 'countryterritoryCode': str,
67 # 'popData2019': np.int64,
68 # 'continentExp': str,
69 # }
70 )
71
72 raw_data.fillna(0, inplace=True)
73 raw_data = raw_data[raw_data.country_code != 0]
74 # raw_data
75
76
77 # %%
78 raw_data['date'] = raw_data.year_week.apply(lambda yw: datetime.datetime.fromisocalendar(int(yw[:4]), int(yw[5:]), 7 ))
79 # raw_data
80
81 # %%
82 # raw_data.dtypes
83
84 # %%
85 # raw_data.head()
86
87 # %%
88 # raw_data[raw_data.country_code == 'GBR'].sort_values(by='date').tail(20)
89
90 # %%
91 case_data = raw_data[raw_data.indicator == 'cases'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
92 case_data.rename(columns={'weekly_count': 'cases_weekly', 'cumulative_count': 'culm_cases'}, inplace=True)
93 # case_data.head()
94
95 # %%
96 deaths_data = raw_data[raw_data.indicator == 'deaths'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
97 deaths_data.rename(columns={'weekly_count': 'deaths_weekly', 'cumulative_count': 'culm_deaths'}, inplace=True)
98 # deaths_data.head()
99
100 # %%
101 case_death_data = case_data.merge(deaths_data)
102 # case_death_data
103
104 # %%
105 # raw_data.columns
106
107 # %%
108 country_data = raw_data[['country', 'country_code', 'continent', 'population']].drop_duplicates()
109 # country_data
110
111 # %%
112
113 # %%
114 # raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
115 # 'countriesAndTerritories': 'country_name',
116 # 'countryterritoryCode': 'country_territory_code',
117 # 'popData2019': 'population_2019',
118 # 'continentExp': 'continent'}, inplace=True)
119
120 # raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
121 # 'countriesAndTerritories': 'country_name',
122 # 'countryterritoryCode': 'country_territory_code',
123 # 'cases': 'cases_weekly',
124 # 'deaths': 'deaths_weekly',
125 # 'popData2019': 'population_2019',
126 # 'continentExp': 'continent',
127 # # 'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000': 'notification_rate_per_100000_population_14-days'
128 # },
129 # inplace=True)
130
131
132 # %%
133 case_death_data.to_sql(
134 'weekly_cases',
135 engine,
136 if_exists='replace',
137 index=False,
138 chunksize=500,
139 dtype={
140 "report_date": Date,
141 "cases_weekly": Integer,
142 "cases_culm": Integer,
143 "deaths_weekly": Integer,
144 "deaths_culm": Integer,
145 "country_code": String
146 }
147 )
148
149
150 # %%
151 country_data.to_sql(
152 'countries',
153 engine,
154 if_exists='replace',
155 index=False,
156 chunksize=500,
157 dtype={
158 "country_name": Text,
159 "country_code": String,
160 "population": Integer,
161 "continent": Text
162 }
163 )
164
165
166 # %%
167 uk_query_string = (
168 "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
169 "metric=covidOccupiedMVBeds&"
170 "metric=newAdmissions&"
171 "metric=newCasesBySpecimenDate&"
172 "metric=hospitalCases&"
173 "metric=newDeaths28DaysByPublishDate&"
174 "format=csv"
175 )
176
177 os.system(f'curl "{uk_query_string}" > uk_data.csv')
178
179
180 # %%
181 test_query_string = (
182 "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
183 "metric=newPCRTestsByPublishDate&"
184 "metric=newTestsByPublishDate&"
185 "metric=newPillarOneTwoTestsByPublishDate&"
186 "format=csv"
187 )
188 os.system(f'curl "{test_query_string}" > test_data.csv')
189
190
191 # %%
192 uk_data = pd.read_csv('uk_data.csv',
193 parse_dates=[0], dayfirst=True)
194
195
196 # %%
197 test_data = pd.read_csv('test_data.csv',
198 parse_dates=[0], dayfirst=True)
199
200
201 # %%
202 uk_data = uk_data.merge(test_data[['date', 'newPCRTestsByPublishDate',
203 'newTestsByPublishDate', 'newPillarOneTwoTestsByPublishDate']], how='outer', on='date')
204
205
206 # %%
207 uk_data.rename(
208 columns={
209 'covidOccupiedMVBeds': 'ventilator_beds',
210 'newCasesBySpecimenDate': 'new_cases',
211 'hospitalCases': 'hospital_cases',
212 'newDeaths28DaysByPublishDate': 'new_deaths',
213 'newAdmissions': 'new_admissions',
214 'newPCRTestsByPublishDate': 'new_pcr_tests',
215 'newTestsByPublishDate': 'new_tests',
216 'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests'
217 }, inplace=True)
218
219
220 # %%
221 uk_data[['date',
222 'hospital_cases', 'ventilator_beds',
223 'new_cases', 'new_deaths',
224 'hospital_cases', 'new_admissions',
225 'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests'
226 ]].to_sql(
227 'uk_data',
228 engine,
229 if_exists='replace',
230 index=False,
231 chunksize=500,
232 dtype={
233 "date": Date,
234 "hospital_cases": Integer,
235 "ventilator_beds": Integer,
236 "new_cases": Integer,
237 "hospital_cases": Integer,
238 "new_deaths": Integer,
239 "new_admissions": Integer,
240 'new_pcr_tests': Integer,
241 'new_tests': Integer,
242 'new_pillar_1_2_tests': Integer
243 }
244 )
245
246
247 # %%
248 query_string = '''drop table if exists uk_data_7;
249 create table uk_data_7
250 (date date primary key,
251 hospital_cases real,
252 ventilator_beds real,
253 new_cases real,
254 new_deaths real,
255 new_admissions real,
256 new_pcr_tests real,
257 new_tests real,
258 new_pillar_1_2_tests real
259 );'''
260
261 with engine.connect() as connection:
262 connection.execute(query_string)
263
264
265 # %%
266 query_string = '''with ownd as (
267 select date,
268 avg(hospital_cases) over wnd as w_hospital_cases,
269 avg(ventilator_beds) over wnd as w_ventilator_beds,
270 avg(new_cases) over wnd as w_new_cases,
271 avg(new_deaths) over wnd as w_new_deaths,
272 avg(new_admissions) over wnd as w_new_admissions,
273 avg(new_pcr_tests) over wnd as w_new_pcr_tests,
274 avg(new_tests) over wnd as w_new_tests,
275 avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
276 count(*) over wnd as w_size
277 from uk_data
278 window wnd as (
279 order by uk_data.date
280 rows between 3 preceding and 3 following
281 )
282 )
283 insert into uk_data_7(date,
284 hospital_cases,
285 ventilator_beds,
286 new_cases,
287 new_deaths,
288 new_admissions,
289 new_pcr_tests,
290 new_tests,
291 new_pillar_1_2_tests
292 )
293 (select date,
294 w_hospital_cases,
295 w_ventilator_beds,
296 w_new_cases,
297 w_new_deaths,
298 w_new_admissions,
299 w_new_pcr_tests,
300 w_new_tests,
301 w_new_pillar_1_2_tests
302 from ownd
303 where w_size = 7
304 )'''
305 with engine.connect() as connection:
306 connection.execute(query_string)
307