data_import.py

   1 #!/usr/bin/env python
   2 # coding: utf-8
   3 # %%
   4 # Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
   5
   6 from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
   7 from sqlalchemy import create_engine
   8
   9 # import itertools
  10 # import collections
  11 # import json
  12 import pandas as pd
  13 import numpy as np
  14 # from scipy.stats import gmean
  15 import datetime
  16 import os
  17 # import matplotlib as mpl
  18 # import matplotlib.pyplot as plt
  19 # import matplotlib.animation as ani
  20
  21 # # %matplotlib inline
  22
  23
  24 # %%
  25 connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
  26
  27 eng = create_engine(connection_string)
  28 engine = eng.execution_options(isolation_level="AUTOCOMMIT")
  29
  30
  31 # %%
  32 # DEATH_COUNT_THRESHOLD = 10
  33 # COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split()
  34 # COUNTRIES_NORDIC = 'SE NO DK FI UK'.split()
  35 # COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split()
  36 # # COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split()
  37
  38 # COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA
  39 #        'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
  40 #        'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
  41 #        'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE']
  42 # COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
  43 # COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))
  44
  45
  46 # %%
  47 ## Old data source
  48 # os.system('curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv')
  49
  50 ## In outer shell script, as it doesn't seem to work here
  51 os.system('curl   https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/csv/ > covid.csv')
  52
  53
  54 # %%
  55 # First col is a date, treat geoId of NA as 'Namibia', not "NA" value
  56 raw_data = pd.read_csv('covid.csv',
  57                        parse_dates=[7], dayfirst=True,
  58                        keep_default_na=False, na_values = [''],
  59 #                        dtype = {'day': np.int64,
  60 #                                 'month': np.int64,
  61 #                                 'year': np.int64,
  62 #                                 'cases': np.int64,
  63 #                                 'deaths': np.int64,
  64 #                                 'countriesAndTerritories': str,
  65 #                                 'geoId': str,
  66 #                                 'countryterritoryCode': str,
  67 #                                 'popData2019': np.int64,
  68 #                                 'continentExp': str,
  69 #                                 }
  70                       )
  71
  72 raw_data.fillna(0, inplace=True)
  73 raw_data = raw_data[raw_data.country_code != 0]
  74 # raw_data
  75
  76
  77 # %%
  78 raw_data['date'] = raw_data.year_week.apply(lambda yw: datetime.datetime.fromisocalendar(int(yw[:4]), int(yw[5:]), 7 ))
  79 # raw_data
  80
  81 # %%
  82 # raw_data.dtypes
  83
  84 # %%
  85 # raw_data.head()
  86
  87 # %%
  88 # raw_data[raw_data.country_code == 'GBR'].sort_values(by='date').tail(20)
  89
  90 # %%
  91 case_data = raw_data[raw_data.indicator == 'cases'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
  92 case_data.rename(columns={'weekly_count': 'cases_weekly', 'cumulative_count': 'culm_cases'}, inplace=True)
  93 # case_data.head()
  94
  95 # %%
  96 deaths_data = raw_data[raw_data.indicator == 'deaths'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
  97 deaths_data.rename(columns={'weekly_count': 'deaths_weekly', 'cumulative_count': 'culm_deaths'}, inplace=True)
  98 # deaths_data.head()
  99
 100 # %%
 101 case_death_data = case_data.merge(deaths_data)
 102 # case_death_data
 103
 104 # %%
 105 # raw_data.columns
 106
 107 # %%
 108 country_data = raw_data[['country', 'country_code', 'continent', 'population']].drop_duplicates()
 109 # country_data
 110
 111 # %%
 112
 113 # %%
 114 # raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
 115 #                         'countriesAndTerritories': 'country_name',
 116 #                         'countryterritoryCode': 'country_territory_code',
 117 #                         'popData2019': 'population_2019',
 118 #                         'continentExp': 'continent'}, inplace=True)
 119
 120 # raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
 121 #                         'countriesAndTerritories': 'country_name',
 122 #                         'countryterritoryCode': 'country_territory_code',
 123 #                         'cases': 'cases_weekly',
 124 #                         'deaths': 'deaths_weekly',
 125 #                         'popData2019': 'population_2019',
 126 #                         'continentExp': 'continent',
 127 # #                         'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000': 'notification_rate_per_100000_population_14-days'
 128 #                         },
 129 #                 inplace=True)
 130
 131
 132 # %%
 133 case_death_data.to_sql(
 134     'weekly_cases',
 135     engine,
 136     if_exists='replace',
 137     index=False,
 138     chunksize=500,
 139     dtype={
 140         "report_date": Date,
 141         "cases_weekly": Integer,
 142         "cases_culm": Integer,
 143         "deaths_weekly": Integer,
 144         "deaths_culm": Integer,
 145         "country_code": String
 146     }
 147 )
 148
 149
 150 # %%
 151 country_data.to_sql(
 152     'countries',
 153     engine,
 154     if_exists='replace',
 155     index=False,
 156     chunksize=500,
 157     dtype={
 158         "country_name": Text,
 159         "country_code": String,
 160         "population": Integer,
 161         "continent": Text
 162     }
 163 )
 164
 165
 166 # %%
 167 uk_query_string = (
 168 "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
 169 "metric=covidOccupiedMVBeds&"
 170 "metric=newAdmissions&"
 171 "metric=newCasesBySpecimenDate&"
 172 "metric=hospitalCases&"
 173 "metric=newDeaths28DaysByPublishDate&"
 174 "format=csv"
 175 )
 176
 177 os.system(f'curl "{uk_query_string}" > uk_data.csv')
 178
 179
 180 # %%
 181 test_query_string = (
 182 "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
 183 "metric=newPCRTestsByPublishDate&"
 184 "metric=newTestsByPublishDate&"
 185 "metric=newPillarOneTwoTestsByPublishDate&"
 186 "format=csv"
 187 )
 188 os.system(f'curl "{test_query_string}" > test_data.csv')
 189
 190
 191 # %%
 192 uk_data = pd.read_csv('uk_data.csv',
 193                        parse_dates=[0], dayfirst=True)
 194
 195
 196 # %%
 197 test_data = pd.read_csv('test_data.csv',
 198                        parse_dates=[0], dayfirst=True)
 199
 200
 201 # %%
 202 uk_data = uk_data.merge(test_data[['date', 'newPCRTestsByPublishDate',
 203        'newTestsByPublishDate', 'newPillarOneTwoTestsByPublishDate']], how='outer', on='date')
 204
 205
 206 # %%
 207 uk_data.rename(
 208     columns={
 209         'covidOccupiedMVBeds': 'ventilator_beds',
 210         'newCasesBySpecimenDate': 'new_cases',
 211         'hospitalCases': 'hospital_cases',
 212         'newDeaths28DaysByPublishDate': 'new_deaths',
 213         'newAdmissions': 'new_admissions',
 214         'newPCRTestsByPublishDate': 'new_pcr_tests',
 215         'newTestsByPublishDate': 'new_tests',
 216         'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests'
 217     }, inplace=True)
 218
 219
 220 # %%
 221 uk_data[['date',
 222          'hospital_cases', 'ventilator_beds',
 223          'new_cases', 'new_deaths',
 224          'hospital_cases', 'new_admissions',
 225          'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests'
 226         ]].to_sql(
 227     'uk_data',
 228     engine,
 229     if_exists='replace',
 230     index=False,
 231     chunksize=500,
 232     dtype={
 233         "date": Date,
 234         "hospital_cases": Integer,
 235         "ventilator_beds": Integer,
 236         "new_cases": Integer,
 237         "hospital_cases": Integer,
 238         "new_deaths": Integer,
 239         "new_admissions": Integer,
 240         'new_pcr_tests': Integer,
 241         'new_tests': Integer,
 242         'new_pillar_1_2_tests': Integer
 243     }
 244 )
 245
 246
 247 # %%
 248 query_string = '''drop table if exists uk_data_7;
 249 create table uk_data_7
 250 (date date primary key,
 251  hospital_cases real,
 252  ventilator_beds real,
 253  new_cases real,
 254  new_deaths real,
 255  new_admissions real,
 256  new_pcr_tests real,
 257  new_tests real,
 258  new_pillar_1_2_tests real
 259 );'''
 260
 261 with engine.connect() as connection:
 262     connection.execute(query_string)
 263
 264
 265 # %%
 266 query_string = '''with ownd as (
 267     select date,
 268         avg(hospital_cases) over wnd as w_hospital_cases,
 269         avg(ventilator_beds) over wnd as w_ventilator_beds,
 270         avg(new_cases) over wnd as w_new_cases,
 271         avg(new_deaths) over wnd as w_new_deaths,
 272         avg(new_admissions) over wnd as w_new_admissions,
 273         avg(new_pcr_tests) over wnd as w_new_pcr_tests,
 274         avg(new_tests) over wnd as w_new_tests,
 275         avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
 276         count(*) over wnd as w_size
 277     from uk_data
 278     window wnd as (
 279         order by uk_data.date
 280         rows between 3 preceding and 3 following
 281     )
 282 )
 283 insert into uk_data_7(date,
 284     hospital_cases,
 285     ventilator_beds,
 286     new_cases,
 287     new_deaths,
 288     new_admissions,
 289     new_pcr_tests,
 290     new_tests,
 291     new_pillar_1_2_tests
 292     )
 293 (select date,
 294         w_hospital_cases,
 295         w_ventilator_beds,
 296         w_new_cases,
 297         w_new_deaths,
 298         w_new_admissions,
 299         w_new_pcr_tests,
 300         w_new_tests,
 301         w_new_pillar_1_2_tests
 302     from ownd
 303     where w_size = 7
 304 )'''
 305 with engine.connect() as connection:
 306     connection.execute(query_string)
 307