#!/usr/bin/env python
# coding: utf-8
# %%
# Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)

from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
from sqlalchemy import create_engine, text

# import itertools
# import collections
# import json
import pandas as pd
import numpy as np
# from scipy.stats import gmean
import datetime
import os
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import matplotlib.animation as ani

# # %matplotlib inline


# %%
connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'

eng = create_engine(connection_string)
engine = eng.execution_options(isolation_level="AUTOCOMMIT")
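# execution_options() returns a copy of the engine pinned to AUTOCOMMIT
# isolation, so the raw SQL statements further down commit as they run
# without explicit transaction handling.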


# %%
# DEATH_COUNT_THRESHOLD = 10
# COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split()
# COUNTRIES_NORDIC = 'SE NO DK FI UK'.split()
# COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split()
# # COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split()

# COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA
#                       'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
#                       'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
#                       'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE']
# COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
# COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))


# %%
## Old data source
# os.system('curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv')

## In outer shell script, as it doesn't seem to work here
os.system('curl https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/csv/ > covid.csv')


# %%
# Parse the date column on read; keep the geo code 'NA' (Namibia) as a string rather than treating it as a missing value
raw_data = pd.read_csv('covid.csv',
                       parse_dates=[7], dayfirst=True,
                       keep_default_na=False, na_values=[''],
                       # dtype={'day': np.int64,
                       #        'month': np.int64,
                       #        'year': np.int64,
                       #        'cases': np.int64,
                       #        'deaths': np.int64,
                       #        'countriesAndTerritories': str,
                       #        'geoId': str,
                       #        'countryterritoryCode': str,
                       #        'popData2019': np.int64,
                       #        'continentExp': str,
                       #        }
                       )

raw_data.fillna(0, inplace=True)
raw_data = raw_data[raw_data.country_code != 0]
# raw_data


# %%
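# year_week strings look like '2021-03'; fromisocalendar(year, week, 7) gives the
# Sunday (ISO weekday 7) of that ISO week, so each weekly record is stamped with
# the date the week ended, e.g. '2021-03' -> 2021-01-24.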
raw_data['date'] = raw_data.year_week.apply(lambda yw: datetime.datetime.fromisocalendar(int(yw[:4]), int(yw[5:]), 7))
# raw_data

# %%
# raw_data.dtypes

# %%
# raw_data.head()

# %%
# raw_data[raw_data.country_code == 'GBR'].sort_values(by='date').tail(20)

# %%
case_data = raw_data[raw_data.indicator == 'cases'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
case_data.rename(columns={'weekly_count': 'cases_weekly', 'cumulative_count': 'culm_cases'}, inplace=True)
# case_data.head()

# %%
deaths_data = raw_data[raw_data.indicator == 'deaths'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
deaths_data.rename(columns={'weekly_count': 'deaths_weekly', 'cumulative_count': 'culm_deaths'}, inplace=True)
# deaths_data.head()

# %%
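# With no explicit keys, merge() joins on the columns the two frames share:
# country_code and date.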
case_death_data = case_data.merge(deaths_data)
# case_death_data

# %%
# raw_data.columns

# %%
country_data = raw_data[['country', 'country_code', 'continent', 'population']].drop_duplicates()
# country_data

# %%

# %%
# raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
#                          'countriesAndTerritories': 'country_name',
#                          'countryterritoryCode': 'country_territory_code',
#                          'popData2019': 'population_2019',
#                          'continentExp': 'continent'}, inplace=True)

# raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
#                          'countriesAndTerritories': 'country_name',
#                          'countryterritoryCode': 'country_territory_code',
#                          'cases': 'cases_weekly',
#                          'deaths': 'deaths_weekly',
#                          'popData2019': 'population_2019',
#                          'continentExp': 'continent',
#                          # 'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000': 'notification_rate_per_100000_population_14-days'
#                          },
#                 inplace=True)


# %%
case_death_data.to_sql(
    'weekly_cases',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "date": Date,
        "cases_weekly": Integer,
        "culm_cases": Integer,
        "deaths_weekly": Integer,
        "culm_deaths": Integer,
        "country_code": String
    }
)
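

# %%
# Optional read-back check, commented out like the other spot checks above:
# pd.read_sql_query('select count(*) from weekly_cases', engine)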


# %%
country_data.to_sql(
    'countries',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "country": Text,
        "country_code": String,
        "population": Integer,
        "continent": Text
    }
)


# %%
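# The coronavirus.data.gov.uk v2 API takes repeated metric= parameters and
# returns one CSV row per date for the requested area (areaType=overview, i.e. the UK as a whole).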
uk_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    "metric=covidOccupiedMVBeds&"
    "metric=newAdmissions&"
    "metric=newCasesBySpecimenDate&"
    "metric=hospitalCases&"
    "metric=newDeaths28DaysByPublishDate&"
    "metric=cumDeaths28DaysByPublishDate&"
    "format=csv"
)

os.system(f'curl "{uk_query_string}" > uk_data.csv')


# %%
test_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    "metric=newPCRTestsByPublishDate&"
    "metric=newTestsByPublishDate&"
    "metric=newPillarOneTwoTestsByPublishDate&"
    "format=csv"
)
os.system(f'curl "{test_query_string}" > test_data.csv')


# %%
uk_data = pd.read_csv('uk_data.csv',
                      parse_dates=[0], dayfirst=True)


# %%
test_data = pd.read_csv('test_data.csv',
                        parse_dates=[0], dayfirst=True)


# %%
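# Outer merge: keep every date that appears in either download, since the case
# series and the testing series may not cover exactly the same date range.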
uk_data = uk_data.merge(test_data[['date', 'newPCRTestsByPublishDate',
                                   'newTestsByPublishDate', 'newPillarOneTwoTestsByPublishDate']], how='outer', on='date')


# %%
uk_data.rename(
    columns={
        'covidOccupiedMVBeds': 'ventilator_beds',
        'newCasesBySpecimenDate': 'new_cases',
        'hospitalCases': 'hospital_cases',
        'newDeaths28DaysByPublishDate': 'new_deaths',
        'cumDeaths28DaysByPublishDate': 'cum_deaths',
        'newAdmissions': 'new_admissions',
        'newPCRTestsByPublishDate': 'new_pcr_tests',
        'newTestsByPublishDate': 'new_tests',
        'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests'
    }, inplace=True)


# %%
uk_data[['date',
         'hospital_cases', 'ventilator_beds',
         'new_cases', 'new_deaths',
         'cum_deaths',
         'new_admissions',
         'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests'
         ]].to_sql(
    'uk_data',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "date": Date,
        "hospital_cases": Integer,
        "ventilator_beds": Integer,
        "new_cases": Integer,
        "new_deaths": Integer,
        "cum_deaths": Integer,
        "new_admissions": Integer,
        "new_pcr_tests": Integer,
        "new_tests": Integer,
        "new_pillar_1_2_tests": Integer
    }
)


# %%
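# uk_data_7 holds centred 7-day moving averages of the daily UK series,
# built by the window query in the next cell.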
query_string = '''drop table if exists uk_data_7;
create table uk_data_7
(date date primary key,
hospital_cases real,
ventilator_beds real,
new_cases real,
new_deaths real,
new_admissions real,
new_pcr_tests real,
new_tests real,
new_pillar_1_2_tests real
);'''

with engine.connect() as connection:
    connection.execute(text(query_string))


# %%
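# Centred 7-day moving average: each day is averaged with the three days either
# side of it, and the w_size = 7 filter drops the incomplete windows at the
# start and end of the series.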
query_string = '''with ownd as (
    select date,
           avg(hospital_cases) over wnd as w_hospital_cases,
           avg(ventilator_beds) over wnd as w_ventilator_beds,
           avg(new_cases) over wnd as w_new_cases,
           avg(new_deaths) over wnd as w_new_deaths,
           avg(new_admissions) over wnd as w_new_admissions,
           avg(new_pcr_tests) over wnd as w_new_pcr_tests,
           avg(new_tests) over wnd as w_new_tests,
           avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
           count(*) over wnd as w_size
    from uk_data
    window wnd as (
        order by uk_data.date
        rows between 3 preceding and 3 following
    )
)
insert into uk_data_7(date,
                      hospital_cases,
                      ventilator_beds,
                      new_cases,
                      new_deaths,
                      new_admissions,
                      new_pcr_tests,
                      new_tests,
                      new_pillar_1_2_tests
                      )
(select date,
        w_hospital_cases,
        w_ventilator_beds,
        w_new_cases,
        w_new_deaths,
        w_new_admissions,
        w_new_pcr_tests,
        w_new_tests,
        w_new_pillar_1_2_tests
 from ownd
 where w_size = 7
)'''
with engine.connect() as connection:
    connection.execute(text(query_string))
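

# %%
# Optional peek at the smoothed table, commented out like the other checks:
# pd.read_sql_query('select * from uk_data_7 order by date desc limit 7', engine)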


# %%