Now using py files, for automation
[covid19.git] / data_import.py
1 #!/usr/bin/env python
2 # coding: utf-8
3 # %%
4 # Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
5
6 from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
7 from sqlalchemy import create_engine
8
9 # import itertools
10 # import collections
11 # import json
12 import pandas as pd
13 import numpy as np
14 # from scipy.stats import gmean
15 import datetime
16 import os
17
18
# %%
# Database connection.  The URL (including credentials) can be overridden
# via the COVID_DB_URL environment variable; falls back to the local
# development default so existing runs are unaffected.
# NOTE(review): avoid keeping real credentials in source control — prefer
# setting COVID_DB_URL in the environment.
connection_string = os.environ.get(
    'COVID_DB_URL',
    'postgresql://covid:3NbjJTkT63@localhost/covid')

eng = create_engine(connection_string)
# AUTOCOMMIT so the DDL statements later in this script take effect
# immediately, outside an explicit transaction.
engine = eng.execution_options(isolation_level="AUTOCOMMIT")
24
25
# %%
# Reporting threshold and country groupings (two-letter geo codes as used
# by the ECDC dataset).
DEATH_COUNT_THRESHOLD = 10

COUNTRIES_CORE = ['IT', 'DE', 'UK', 'ES', 'IE', 'FR', 'BE']
COUNTRIES_NORDIC = ['SE', 'NO', 'DK', 'FI', 'UK']
COUNTRIES_FRIENDS = ['IT', 'UK', 'ES', 'BE', 'SI', 'MX']

# The Americas, excluding Canada and the USA.
COUNTRIES_AMERICAS = [
    'AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY',
    'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
    'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
    'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE',
]

# Derived groupings; deduplicated via set, so ordering is unspecified.
COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS
                         + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))
39
40
# %%
# Download the latest ECDC weekly case/death data.  The original ignored
# curl's exit status, so a failed download would silently leave a stale or
# empty covid.csv for the load below; fail fast instead.
download_status = os.system(
    'curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv')
if download_status != 0:
    raise RuntimeError(
        f'ECDC data download failed (exit status {download_status})')
43
44
# %%
# Load the ECDC CSV.  The first column is the report date in day-first
# format.  keep_default_na=False with na_values=[''] means only truly
# empty cells are treated as missing — so Namibia's geoId of "NA" stays a
# string instead of being parsed as a missing value.
raw_data = pd.read_csv(
    'covid.csv',
    parse_dates=[0],
    dayfirst=True,
    keep_default_na=False,
    na_values=[''],
)

# Any remaining blanks become zero.
raw_data.fillna(0, inplace=True)


# %%
# Rename to the snake_case column names used by the database tables.
column_renames = {
    'dateRep': 'report_date',
    'geoId': 'geo_id',
    'countriesAndTerritories': 'country_name',
    'countryterritoryCode': 'country_territory_code',
    'popData2019': 'population_2019',
    'continentExp': 'continent',
}
raw_data.rename(columns=column_renames, inplace=True)
72
73
# %%
# Write the weekly case/death figures to the weekly_cases table,
# replacing any previous load.
weekly_columns = ['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id',
                  'notification_rate_per_100000_population_14-days']
weekly_dtypes = {
    "report_date": Date,
    "cases_weekly": Integer,
    "deaths_weekly": Integer,
    "geo_id": String,
    "notification_rate_per_100000_population_14-days": Float,
}
raw_data[weekly_columns].to_sql(
    'weekly_cases',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype=weekly_dtypes,
)
89
90
# %%
# One row per country: name, codes, 2019 population and continent.
# drop_duplicates collapses the per-week repetition in the raw data.
country_columns = ['country_name', 'geo_id', 'country_territory_code',
                   'population_2019', 'continent']
country_dtypes = {
    "country_name": Text,
    "geo_id": String,
    "country_territory_code": String,
    "population_2019": Integer,
    "continent": Text,
}
raw_data[country_columns].drop_duplicates().to_sql(
    'countries',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype=country_dtypes,
)
107
108
# %%
# Add keys to the freshly (re)created tables, plus the cumulative columns
# that the update queries below will fill in.
ddl_statements = [
    'alter table weekly_cases add primary key (geo_id, report_date)',
    'alter table countries add primary key (geo_id);',
    'alter table weekly_cases add foreign key (geo_id) references countries(geo_id);',
    'alter table weekly_cases add culm_cases integer;',
    'alter table weekly_cases add culm_deaths integer;',
]
with engine.connect() as connection:
    for statement in ddl_statements:
        connection.execute(statement)
116
117
# %%
# Fill the cumulative columns.  For each (source, target) pair, compute a
# running sum per country (partition by geo_id, ordered by report date)
# and write it back to weekly_cases.  This replaces two near-identical
# hand-written queries with one parameterized loop; the column names are
# fixed constants here, not external input, so f-string interpolation into
# the SQL is safe.
for source_column, target_column in [('cases_weekly', 'culm_cases'),
                                     ('deaths_weekly', 'culm_deaths')]:
    query_string = f'''with culm as
(select report_date, geo_id,
        sum({source_column}) over (partition by geo_id
                                   order by report_date) as culm_data
    from weekly_cases
)
update weekly_cases
    set {target_column} = culm_data
    from culm
    where weekly_cases.report_date = culm.report_date and
          weekly_cases.geo_id = culm.geo_id'''
    with engine.connect() as connection:
        connection.execute(query_string)
148
149
# %%
# UK overview data from the coronavirus dashboard API: ventilator-bed and
# hospital occupancy, admissions, cases and deaths, fetched as CSV.
uk_metrics = [
    "covidOccupiedMVBeds",
    "newAdmissions",
    "newCasesBySpecimenDate",
    "hospitalCases",
    "newDeaths28DaysByPublishDate",
]
uk_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    + "".join(f"metric={metric}&" for metric in uk_metrics)
    + "format=csv"
)

os.system(f'curl "{uk_query_string}" > uk_data.csv')
162
163
# %%
# Testing volumes come from a separate dashboard endpoint.
testing_metrics = [
    "newPCRTestsByPublishDate",
    "newTestsByPublishDate",
    "newPillarOneTwoTestsByPublishDate",
]
test_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    + "".join(f"metric={metric}&" for metric in testing_metrics)
    + "format=csv"
)
os.system(f'curl "{test_query_string}" > test_data.csv')
173
174
# %%
# Load both UK downloads (first column is the date, day-first) and join
# the testing figures onto the main dataset.  An outer join keeps dates
# that appear in only one of the two files.
uk_data = pd.read_csv('uk_data.csv', parse_dates=[0], dayfirst=True)
test_data = pd.read_csv('test_data.csv', parse_dates=[0], dayfirst=True)

uk_data = uk_data.merge(
    test_data[['date', 'newPCRTestsByPublishDate', 'newTestsByPublishDate',
               'newPillarOneTwoTestsByPublishDate']],
    how='outer',
    on='date',
)
188
189
# %%
# Snake-case names for the uk_data database table.
uk_column_renames = {
    'covidOccupiedMVBeds': 'ventilator_beds',
    'newCasesBySpecimenDate': 'new_cases',
    'hospitalCases': 'hospital_cases',
    'newDeaths28DaysByPublishDate': 'new_deaths',
    'newAdmissions': 'new_admissions',
    'newPCRTestsByPublishDate': 'new_pcr_tests',
    'newTestsByPublishDate': 'new_tests',
    'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests',
}
uk_data.rename(columns=uk_column_renames, inplace=True)
202
203
# %%
# Write the merged UK dataset to the uk_data table, replacing any previous
# load.
# Fix: 'hospital_cases' was listed twice in the column selection, which
# selects a duplicated column, and was a duplicate key in the dtype dict
# (where the second entry silently wins); the duplicates are removed.
uk_data[['date',
         'hospital_cases', 'ventilator_beds',
         'new_cases', 'new_deaths',
         'new_admissions',
         'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests'
         ]].to_sql(
    'uk_data',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "date": Date,
        "hospital_cases": Integer,
        "ventilator_beds": Integer,
        "new_cases": Integer,
        "new_deaths": Integer,
        "new_admissions": Integer,
        "new_pcr_tests": Integer,
        "new_tests": Integer,
        "new_pillar_1_2_tests": Integer,
    },
)
229
230
# %%
# Recreate the 7-day-smoothed UK table; it is populated by the windowed
# insert in the next cell.  Columns are real because they hold averages.
with engine.connect() as connection:
    connection.execute('''drop table if exists uk_data_7;
create table uk_data_7
(date date primary key,
hospital_cases real,
ventilator_beds real,
new_cases real,
new_deaths real,
new_admissions real,
new_pcr_tests real,
new_tests real,
new_pillar_1_2_tests real
);''')
247
248
# %%
# Smooth the daily UK series with a centred 7-row moving average:
# the window is 3 rows before and 3 after each date, w_size counts the
# rows actually present in the window, and the insert keeps only full
# 7-row windows — so the first and last 3 dates are dropped.
# NOTE(review): the window counts *rows*, not calendar days; if any dates
# are missing from uk_data, a "7-day" average spans more than 7 days —
# confirm uk_data has one row per day.
query_string = '''with ownd as (
    select date,
           avg(hospital_cases) over wnd as w_hospital_cases,
           avg(ventilator_beds) over wnd as w_ventilator_beds,
           avg(new_cases) over wnd as w_new_cases,
           avg(new_deaths) over wnd as w_new_deaths,
           avg(new_admissions) over wnd as w_new_admissions,
           avg(new_pcr_tests) over wnd as w_new_pcr_tests,
           avg(new_tests) over wnd as w_new_tests,
           avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
           count(*) over wnd as w_size
    from uk_data
    window wnd as (
        order by uk_data.date
        rows between 3 preceding and 3 following
    )
)
insert into uk_data_7(date,
    hospital_cases,
    ventilator_beds,
    new_cases,
    new_deaths,
    new_admissions,
    new_pcr_tests,
    new_tests,
    new_pillar_1_2_tests
    )
(select date,
    w_hospital_cases,
    w_ventilator_beds,
    w_new_cases,
    w_new_deaths,
    w_new_admissions,
    w_new_pcr_tests,
    w_new_tests,
    w_new_pillar_1_2_tests
    from ownd
    where w_size = 7
)'''
with engine.connect() as connection:
    connection.execute(query_string)
291