#!/usr/bin/env python
# coding: utf-8
# %%
# Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)

from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
from sqlalchemy import create_engine, text

# import itertools
# import collections
# import json
import pandas as pd
import numpy as np
# from scipy.stats import gmean
import datetime
import os
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import matplotlib.animation as ani

# # %matplotlib inline


# %%
connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'

eng = create_engine(connection_string)
engine = eng.execution_options(isolation_level="AUTOCOMMIT")
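# execution_options() returns a copy of the engine pinned to AUTOCOMMIT
# isolation, so the raw SQL statements further down commit as they run
# without explicit transaction handling.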


# %%
# DEATH_COUNT_THRESHOLD = 10
# COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split()
# COUNTRIES_NORDIC = 'SE NO DK FI UK'.split()
# COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split()
# # COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split()

# COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA
#                       'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
#                       'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
#                       'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE']
# COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
# COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))


# %%
## Old data source
# os.system('curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv')

## In outer shell script, as it doesn't seem to work here
os.system('curl https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/csv/ > covid.csv')


# %%
# Parse the date column on read; keep the geo code 'NA' (Namibia) as a string rather than treating it as a missing value
raw_data = pd.read_csv('covid.csv',
                       parse_dates=[7], dayfirst=True,
                       keep_default_na=False, na_values=[''],
                       # dtype={'day': np.int64,
                       #        'month': np.int64,
                       #        'year': np.int64,
                       #        'cases': np.int64,
                       #        'deaths': np.int64,
                       #        'countriesAndTerritories': str,
                       #        'geoId': str,
                       #        'countryterritoryCode': str,
                       #        'popData2019': np.int64,
                       #        'continentExp': str,
                       #        }
                       )

raw_data.fillna(0, inplace=True)
raw_data = raw_data[raw_data.country_code != 0]
# raw_data


# %%
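# year_week strings look like '2021-03'; fromisocalendar(year, week, 7) gives the
# Sunday (ISO weekday 7) of that ISO week, so each weekly record is stamped with
# the date the week ended, e.g. '2021-03' -> 2021-01-24.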
raw_data['date'] = raw_data.year_week.apply(lambda yw: datetime.datetime.fromisocalendar(int(yw[:4]), int(yw[5:]), 7))
# raw_data

# %%
# raw_data.dtypes

# %%
# raw_data.head()

# %%
# raw_data[raw_data.country_code == 'GBR'].sort_values(by='date').tail(20)

# %%
case_data = raw_data[raw_data.indicator == 'cases'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
case_data.rename(columns={'weekly_count': 'cases_weekly', 'cumulative_count': 'culm_cases'}, inplace=True)
# case_data.head()

# %%
deaths_data = raw_data[raw_data.indicator == 'deaths'][['country_code', 'date', 'weekly_count', 'cumulative_count']]
deaths_data.rename(columns={'weekly_count': 'deaths_weekly', 'cumulative_count': 'culm_deaths'}, inplace=True)
# deaths_data.head()

# %%
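# With no explicit keys, merge() joins on the columns the two frames share:
# country_code and date.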
case_death_data = case_data.merge(deaths_data)
# case_death_data

# %%
# raw_data.columns

# %%
country_data = raw_data[['country', 'country_code', 'continent', 'population']].drop_duplicates()
# country_data

# %%

# %%
# raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
#                          'countriesAndTerritories': 'country_name',
#                          'countryterritoryCode': 'country_territory_code',
#                          'popData2019': 'population_2019',
#                          'continentExp': 'continent'}, inplace=True)

# raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
#                          'countriesAndTerritories': 'country_name',
#                          'countryterritoryCode': 'country_territory_code',
#                          'cases': 'cases_weekly',
#                          'deaths': 'deaths_weekly',
#                          'popData2019': 'population_2019',
#                          'continentExp': 'continent',
#                          # 'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000': 'notification_rate_per_100000_population_14-days'
#                          },
#                 inplace=True)


# %%
case_death_data.to_sql(
    'weekly_cases',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "date": Date,
        "cases_weekly": Integer,
        "culm_cases": Integer,
        "deaths_weekly": Integer,
        "culm_deaths": Integer,
        "country_code": String
    }
)
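

# %%
# Optional read-back check, commented out like the other spot checks above:
# pd.read_sql_query('select count(*) from weekly_cases', engine)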


# %%
country_data.to_sql(
    'countries',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "country": Text,
        "country_code": String,
        "population": Integer,
        "continent": Text
    }
)


# %%
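# The coronavirus.data.gov.uk v2 API takes repeated metric= parameters and
# returns one CSV row per date for the requested area (areaType=overview, i.e. the UK as a whole).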
uk_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    "metric=covidOccupiedMVBeds&"
    "metric=newAdmissions&"
    "metric=newCasesBySpecimenDate&"
    "metric=hospitalCases&"
    "metric=newDeaths28DaysByPublishDate&"
    "metric=cumDeaths28DaysByPublishDate&"
    "format=csv"
)

os.system(f'curl "{uk_query_string}" > uk_data.csv')


# %%
test_query_string = (
    "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
    "metric=newPCRTestsByPublishDate&"
    "metric=newTestsByPublishDate&"
    "metric=newPillarOneTwoTestsByPublishDate&"
    "format=csv"
)
os.system(f'curl "{test_query_string}" > test_data.csv')


# %%
uk_data = pd.read_csv('uk_data.csv',
                      parse_dates=[0], dayfirst=True)


# %%
test_data = pd.read_csv('test_data.csv',
                        parse_dates=[0], dayfirst=True)


# %%
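# Outer merge: keep every date that appears in either download, since the case
# series and the testing series may not cover exactly the same date range.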
uk_data = uk_data.merge(test_data[['date', 'newPCRTestsByPublishDate',
                                   'newTestsByPublishDate', 'newPillarOneTwoTestsByPublishDate']], how='outer', on='date')


# %%
uk_data.rename(
    columns={
        'covidOccupiedMVBeds': 'ventilator_beds',
        'newCasesBySpecimenDate': 'new_cases',
        'hospitalCases': 'hospital_cases',
        'newDeaths28DaysByPublishDate': 'new_deaths',
        'cumDeaths28DaysByPublishDate': 'cum_deaths',
        'newAdmissions': 'new_admissions',
        'newPCRTestsByPublishDate': 'new_pcr_tests',
        'newTestsByPublishDate': 'new_tests',
        'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests'
    }, inplace=True)


# %%
uk_data[['date',
         'hospital_cases', 'ventilator_beds',
         'new_cases', 'new_deaths',
         'cum_deaths',
         'new_admissions',
         'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests'
         ]].to_sql(
    'uk_data',
    engine,
    if_exists='replace',
    index=False,
    chunksize=500,
    dtype={
        "date": Date,
        "hospital_cases": Integer,
        "ventilator_beds": Integer,
        "new_cases": Integer,
        "new_deaths": Integer,
        "cum_deaths": Integer,
        "new_admissions": Integer,
        "new_pcr_tests": Integer,
        "new_tests": Integer,
        "new_pillar_1_2_tests": Integer
    }
)


# %%
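# uk_data_7 holds centred 7-day moving averages of the daily UK series,
# built by the window query in the next cell.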
query_string = '''drop table if exists uk_data_7;
create table uk_data_7
(date date primary key,
hospital_cases real,
ventilator_beds real,
new_cases real,
new_deaths real,
new_admissions real,
new_pcr_tests real,
new_tests real,
new_pillar_1_2_tests real
);'''

with engine.connect() as connection:
    connection.execute(text(query_string))


# %%
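# Centred 7-day moving average: each day is averaged with the three days either
# side of it, and the w_size = 7 filter drops the incomplete windows at the
# start and end of the series.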
query_string = '''with ownd as (
    select date,
           avg(hospital_cases) over wnd as w_hospital_cases,
           avg(ventilator_beds) over wnd as w_ventilator_beds,
           avg(new_cases) over wnd as w_new_cases,
           avg(new_deaths) over wnd as w_new_deaths,
           avg(new_admissions) over wnd as w_new_admissions,
           avg(new_pcr_tests) over wnd as w_new_pcr_tests,
           avg(new_tests) over wnd as w_new_tests,
           avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
           count(*) over wnd as w_size
    from uk_data
    window wnd as (
        order by uk_data.date
        rows between 3 preceding and 3 following
    )
)
insert into uk_data_7(date,
                      hospital_cases,
                      ventilator_beds,
                      new_cases,
                      new_deaths,
                      new_admissions,
                      new_pcr_tests,
                      new_tests,
                      new_pillar_1_2_tests
                      )
(select date,
        w_hospital_cases,
        w_ventilator_beds,
        w_new_cases,
        w_new_deaths,
        w_new_admissions,
        w_new_pcr_tests,
        w_new_tests,
        w_new_pillar_1_2_tests
 from ownd
 where w_size = 7
)'''
with engine.connect() as connection:
    connection.execute(text(query_string))
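

# %%
# Optional peek at the smoothed table, commented out like the other checks:
# pd.read_sql_query('select * from uk_data_7 order by date desc limit 7', engine)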


# %%