Now using py files, for automation
[covid19.git] / data_import.py
1 #!/usr/bin/env python
2 # coding: utf-8
3 # %%
4 # Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
5
6 from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
7 from sqlalchemy import create_engine
8
9 # import itertools
10 # import collections
11 # import json
12 import pandas as pd
13 import numpy as np
14 # from scipy.stats import gmean
15 import datetime
16 import os
17
18
# %%
# Database connection.  The URL (including credentials) can be overridden
# via the COVID_DB_URL environment variable; falls back to the local
# development default so existing runs are unaffected.
# NOTE(review): avoid keeping real credentials in source control — prefer
# setting COVID_DB_URL in the environment.
connection_string = os.environ.get(
    'COVID_DB_URL',
    'postgresql://covid:3NbjJTkT63@localhost/covid')

eng = create_engine(connection_string)
# AUTOCOMMIT so the DDL statements later in this script take effect
# immediately, outside an explicit transaction.
engine = eng.execution_options(isolation_level="AUTOCOMMIT")
24
25
# %%
# Reporting threshold and country groupings (two-letter geo codes as used
# by the ECDC dataset).
DEATH_COUNT_THRESHOLD = 10

COUNTRIES_CORE = ['IT', 'DE', 'UK', 'ES', 'IE', 'FR', 'BE']
COUNTRIES_NORDIC = ['SE', 'NO', 'DK', 'FI', 'UK']
COUNTRIES_FRIENDS = ['IT', 'UK', 'ES', 'BE', 'SI', 'MX']

# The Americas, excluding Canada and the USA.
COUNTRIES_AMERICAS = [
    'AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY',
    'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
    'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
    'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE',
]

# Derived groupings; deduplicated via set, so ordering is unspecified.
COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS
                         + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))
39
40
# %%
# Download the latest ECDC weekly case/death data.  The original ignored
# curl's exit status, so a failed download would silently leave a stale or
# empty covid.csv for the load below; fail fast instead.
download_status = os.system(
    'curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv')
if download_status != 0:
    raise RuntimeError(
        f'ECDC data download failed (exit status {download_status})')
43
44
# %%
# Load the ECDC CSV.  The first column is the report date in day-first
# format.  keep_default_na=False with na_values=[''] means only truly
# empty cells are treated as missing — so Namibia's geoId of "NA" stays a
# string instead of being parsed as a missing value.
raw_data = pd.read_csv(
    'covid.csv',
    parse_dates=[0],
    dayfirst=True,
    keep_default_na=False,
    na_values=[''],
)

# Any remaining blanks become zero.
raw_data.fillna(0, inplace=True)


# %%
# Rename to the snake_case column names used by the database tables.
column_renames = {
    'dateRep': 'report_date',
    'geoId': 'geo_id',
    'countriesAndTerritories': 'country_name',
    'countryterritoryCode': 'country_territory_code',
    'popData2019': 'population_2019',
    'continentExp': 'continent',
}
raw_data.rename(columns=column_renames, inplace=True)
72
73
# %%
# Write the weekly case/death figures to the weekly_cases table,
# replacing any previous load.
weekly_columns = ['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id',
                  'notification_rate_per_100000_population_14-days']
weekly_dtypes = {
    "report_date": Date,
    "cases_weekly": Integer,
    "deaths_weekly": Integer,
    "geo_id": String,
    "notification_rate_per_100000_population_14-days": Float,
}
raw_data[weekly_columns].to_sql(
    'weekly_cases',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype=weekly_dtypes,
)
89
90
# %%
# One row per country: name, codes, 2019 population and continent.
# drop_duplicates collapses the per-week repetition in the raw data.
country_columns = ['country_name', 'geo_id', 'country_territory_code',
                   'population_2019', 'continent']
country_dtypes = {
    "country_name": Text,
    "geo_id": String,
    "country_territory_code": String,
    "population_2019": Integer,
    "continent": Text,
}
raw_data[country_columns].drop_duplicates().to_sql(
    'countries',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype=country_dtypes,
)
107
108
# %%
# Add keys to the freshly (re)created tables, plus the cumulative columns
# that the update queries below will fill in.
ddl_statements = [
    'alter table weekly_cases add primary key (geo_id, report_date)',
    'alter table countries add primary key (geo_id);',
    'alter table weekly_cases add foreign key (geo_id) references countries(geo_id);',
    'alter table weekly_cases add culm_cases integer;',
    'alter table weekly_cases add culm_deaths integer;',
]
with engine.connect() as connection:
    for statement in ddl_statements:
        connection.execute(statement)
116
117
# %%
# Fill the cumulative columns.  For each (source, target) pair, compute a
# running sum per country (partition by geo_id, ordered by report date)
# and write it back to weekly_cases.  This replaces two near-identical
# hand-written queries with one parameterized loop; the column names are
# fixed constants here, not external input, so f-string interpolation into
# the SQL is safe.
for source_column, target_column in [('cases_weekly', 'culm_cases'),
                                     ('deaths_weekly', 'culm_deaths')]:
    query_string = f'''with culm as
(select report_date, geo_id,
        sum({source_column}) over (partition by geo_id
                                   order by report_date) as culm_data
    from weekly_cases
)
update weekly_cases
    set {target_column} = culm_data
    from culm
    where weekly_cases.report_date = culm.report_date and
          weekly_cases.geo_id = culm.geo_id'''
    with engine.connect() as connection:
        connection.execute(query_string)
148
149
# %%
# UK overview data from the coronavirus dashboard API: ventilator-bed and
# hospital occupancy, admissions, cases and deaths, fetched as CSV.
uk_metrics = [
    "covidOccupiedMVBeds",
    "newAdmissions",
    "newCasesBySpecimenDate",
    "hospitalCases",
    "newDeaths28DaysByPublishDate",
]
uk_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    + "".join(f"metric={metric}&" for metric in uk_metrics)
    + "format=csv"
)

os.system(f'curl "{uk_query_string}" > uk_data.csv')
162
163
# %%
# Testing volumes come from a separate dashboard endpoint.
testing_metrics = [
    "newPCRTestsByPublishDate",
    "newTestsByPublishDate",
    "newPillarOneTwoTestsByPublishDate",
]
test_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    + "".join(f"metric={metric}&" for metric in testing_metrics)
    + "format=csv"
)
os.system(f'curl "{test_query_string}" > test_data.csv')
173
174
# %%
# Load both UK downloads (first column is the date, day-first) and join
# the testing figures onto the main dataset.  An outer join keeps dates
# that appear in only one of the two files.
uk_data = pd.read_csv('uk_data.csv', parse_dates=[0], dayfirst=True)
test_data = pd.read_csv('test_data.csv', parse_dates=[0], dayfirst=True)

uk_data = uk_data.merge(
    test_data[['date', 'newPCRTestsByPublishDate', 'newTestsByPublishDate',
               'newPillarOneTwoTestsByPublishDate']],
    how='outer',
    on='date',
)
188
189
# %%
# Snake-case names for the uk_data database table.
uk_column_renames = {
    'covidOccupiedMVBeds': 'ventilator_beds',
    'newCasesBySpecimenDate': 'new_cases',
    'hospitalCases': 'hospital_cases',
    'newDeaths28DaysByPublishDate': 'new_deaths',
    'newAdmissions': 'new_admissions',
    'newPCRTestsByPublishDate': 'new_pcr_tests',
    'newTestsByPublishDate': 'new_tests',
    'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests',
}
uk_data.rename(columns=uk_column_renames, inplace=True)
202
203
# %%
# Write the merged UK dataset to the uk_data table, replacing any previous
# load.
# Fix: 'hospital_cases' was listed twice in the column selection, which
# selects a duplicated column, and was a duplicate key in the dtype dict
# (where the second entry silently wins); the duplicates are removed.
uk_data[['date',
         'hospital_cases', 'ventilator_beds',
         'new_cases', 'new_deaths',
         'new_admissions',
         'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests'
         ]].to_sql(
    'uk_data',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "date": Date,
        "hospital_cases": Integer,
        "ventilator_beds": Integer,
        "new_cases": Integer,
        "new_deaths": Integer,
        "new_admissions": Integer,
        "new_pcr_tests": Integer,
        "new_tests": Integer,
        "new_pillar_1_2_tests": Integer,
    },
)
229
230
# %%
# Recreate the 7-day-smoothed UK table; it is populated by the windowed
# insert in the next cell.  Columns are real because they hold averages.
with engine.connect() as connection:
    connection.execute('''drop table if exists uk_data_7;
create table uk_data_7
(date date primary key,
hospital_cases real,
ventilator_beds real,
new_cases real,
new_deaths real,
new_admissions real,
new_pcr_tests real,
new_tests real,
new_pillar_1_2_tests real
);''')
247
248
# %%
# Smooth the daily UK series with a centred 7-row moving average:
# the window is 3 rows before and 3 after each date, w_size counts the
# rows actually present in the window, and the insert keeps only full
# 7-row windows — so the first and last 3 dates are dropped.
# NOTE(review): the window counts *rows*, not calendar days; if any dates
# are missing from uk_data, a "7-day" average spans more than 7 days —
# confirm uk_data has one row per day.
query_string = '''with ownd as (
    select date,
           avg(hospital_cases) over wnd as w_hospital_cases,
           avg(ventilator_beds) over wnd as w_ventilator_beds,
           avg(new_cases) over wnd as w_new_cases,
           avg(new_deaths) over wnd as w_new_deaths,
           avg(new_admissions) over wnd as w_new_admissions,
           avg(new_pcr_tests) over wnd as w_new_pcr_tests,
           avg(new_tests) over wnd as w_new_tests,
           avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
           count(*) over wnd as w_size
    from uk_data
    window wnd as (
        order by uk_data.date
        rows between 3 preceding and 3 following
    )
)
insert into uk_data_7(date,
    hospital_cases,
    ventilator_beds,
    new_cases,
    new_deaths,
    new_admissions,
    new_pcr_tests,
    new_tests,
    new_pillar_1_2_tests
    )
(select date,
    w_hospital_cases,
    w_ventilator_beds,
    w_new_cases,
    w_new_deaths,
    w_new_admissions,
    w_new_pcr_tests,
    w_new_tests,
    w_new_pillar_1_2_tests
    from ownd
    where w_size = 7
)'''
with engine.connect() as connection:
    connection.execute(query_string)
291