Updated for imported data format
[covid19.git] / data_import.md
diff --git a/data_import.md b/data_import.md
deleted file mode 100644 (file)
index 0a680eb..0000000
+++ /dev/null
@@ -1,742 +0,0 @@
----
-jupyter:
-  jupytext:
-    formats: ipynb,md
-    text_representation:
-      extension: .md
-      format_name: markdown
-      format_version: '1.2'
-      jupytext_version: 1.9.1
-  kernelspec:
-    display_name: Python 3
-    language: python
-    name: python3
----
-
-<!-- #region Collapsed="false" -->
-Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide)
-<!-- #endregion -->
-
-```python Collapsed="false"
-from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float
-from sqlalchemy import create_engine
-```
-
-```python Collapsed="false"
-import itertools
-import collections
-import json
-import pandas as pd
-import numpy as np
-from scipy.stats import gmean
-import datetime
-
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-%matplotlib inline
-```
-
-```python Collapsed="false"
-%load_ext sql
-```
-
-```python Collapsed="false"
-connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid'
-```
-
-```python Collapsed="false"
-%sql $connection_string
-```
-
-```python Collapsed="false"
-eng = create_engine(connection_string)
-engine = eng.execution_options(isolation_level="AUTOCOMMIT")
-```
-
-```python Collapsed="false"
-DEATH_COUNT_THRESHOLD = 10
-COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split()
-COUNTRIES_NORDIC = 'SE NO DK FI UK'.split()
-COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split()
-# COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split()
-
-COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA
-       'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT',
-       'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN',
-       'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE']
-COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS))
-COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS))
-```
-
-```python Collapsed="false"
-!curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv
-```
-
-```python Collapsed="false"
-# First col is a date, treat geoId of NA as 'Namibia', not "NA" value
-raw_data = pd.read_csv('covid.csv', 
-                       parse_dates=[0], dayfirst=True,
-                       keep_default_na=False, na_values = [''],
-#                        dtype = {'day': np.int64, 
-#                                 'month': np.int64, 
-#                                 'year': np.int64, 
-#                                 'cases': np.int64, 
-#                                 'deaths': np.int64, 
-#                                 'countriesAndTerritories': str, 
-#                                 'geoId': str, 
-#                                 'countryterritoryCode': str, 
-#                                 'popData2019': np.int64, 
-#                                 'continentExp': str, 
-#                                 }
-                      )
-```
-
-```python Collapsed="false"
-raw_data.size
-```
-
-```python Collapsed="false"
-raw_data.fillna(0, inplace=True)
-```
-
-```python Collapsed="false"
-raw_data.head()
-```
-
-```python Collapsed="false"
-raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id',
-                        'countriesAndTerritories': 'country_name',
-                        'countryterritoryCode': 'country_territory_code',
-                        'popData2019': 'population_2019',
-                        'continentExp': 'continent'}, inplace=True)
-```
-
-```python Collapsed="false"
-raw_data.columns
-```
-
-```python Collapsed="false"
-raw_data.dtypes
-```
-
-```python Collapsed="false"
-raw_data[['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id', 'notification_rate_per_100000_population_14-days']]
-```
-
-```python Collapsed="false"
-raw_data[['country_name', 'geo_id', 'country_territory_code',
-       'population_2019', 'continent']]
-```
-
-```python Collapsed="false"
-raw_data[['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id', 'notification_rate_per_100000_population_14-days']].to_sql(
-    'weekly_cases',
-    engine,
-    if_exists='replace',
-    index=False,
-    chunksize=500,
-    dtype={
-        "report_date": Date,
-        "cases_weekly": Integer,
-        "deaths_weekly": Integer,
-        "geo_id": String,
-        "notification_rate_per_100000_population_14-days": Float
-    }
-)
-```
-
-```python Collapsed="false"
-raw_data[['country_name', 'geo_id', 'country_territory_code',
-       'population_2019', 'continent']].drop_duplicates().to_sql(
-    'countries',
-    engine,
-    if_exists='replace',
-    index=False,
-    chunksize=500,
-    dtype={
-        "country_name": Text,
-        "geo_id": String,
-        "country_territory_code": String,
-        "population_2019": Integer,
-        "continent": Text
-    }
-)
-```
-
-```python Collapsed="false"
-# %sql select geo_id from weekly_cases limit 10
-```
-
-```python Collapsed="false"
-# %%sql alter table weekly_cases add primary key (geo_id, report_date);
-# alter table countries add primary key (geo_id);
-# alter table weekly_cases add foreign key (geo_id) references countries(geo_id);
-# alter table weekly_cases add culm_cases integer;
-# alter table weekly_cases add culm_deaths integer;
-```
-
-```python
-with engine.connect() as connection:
-    connection.execute('alter table weekly_cases add primary key (geo_id, report_date)')
-    connection.execute('alter table countries add primary key (geo_id);')
-    connection.execute('alter table weekly_cases add foreign key (geo_id) references countries(geo_id);')
-    connection.execute('alter table weekly_cases add culm_cases integer;')
-    connection.execute('alter table weekly_cases add culm_deaths integer;')
-```
-
-```python Collapsed="false"
-# %sql select report_date, cases_weekly, country_name from weekly_cases join countries using (geo_id) order by report_date desc limit 10
-```
-
-```python Collapsed="false"
-# %sql select report_date, cases_weekly, country_name from weekly_cases join countries on weekly_cases.geo_id = countries.geo_id order by report_date desc limit 10
-```
-
-```python Collapsed="false"
-%sql select * from weekly_cases limit 10
-```
-
-```python Collapsed="false"
-# %%sql
-# with culm as 
-#     (select report_date, geo_id,
-#         sum(cases_weekly) over (partition by geo_id 
-#                                 order by report_date) as culm_data
-#      from weekly_cases
-#     )
-# update weekly_cases
-#     set culm_cases = culm_data
-#     from culm
-#     where weekly_cases.report_date = culm.report_date and
-#           weekly_cases.geo_id = culm.geo_id
-```
-
-```python Collapsed="false"
-query_string = '''with culm as 
-    (select report_date, geo_id,
-        sum(cases_weekly) over (partition by geo_id 
-                                order by report_date) as culm_data
-     from weekly_cases
-    )
-update weekly_cases
-    set culm_cases = culm_data
-    from culm
-    where weekly_cases.report_date = culm.report_date and
-          weekly_cases.geo_id = culm.geo_id'''
-with engine.connect() as connection:
-    connection.execute(query_string)
-```
-
-```python Collapsed="false"
-# %%sql
-# with culm as 
-#     (select report_date, geo_id,
-#         sum(deaths_weekly) over (partition by geo_id 
-#                                 order by report_date) as culm_data
-#      from weekly_cases
-#     )
-# update weekly_cases
-#     set culm_deaths = culm_data
-#     from culm
-#     where weekly_cases.report_date = culm.report_date and
-#           weekly_cases.geo_id = culm.geo_id
-```
-
-```python Collapsed="false"
-query_string = '''with culm as 
-    (select report_date, geo_id,
-        sum(deaths_weekly) over (partition by geo_id 
-                                order by report_date) as culm_data
-     from weekly_cases
-    )
-update weekly_cases
-    set culm_deaths = culm_data
-    from culm
-    where weekly_cases.report_date = culm.report_date and
-          weekly_cases.geo_id = culm.geo_id'''
-with engine.connect() as connection:
-    connection.execute(query_string)
-```
-
-```python Collapsed="false"
-uk_query_string = (
-"https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
-"metric=covidOccupiedMVBeds&"
-"metric=newAdmissions&"
-"metric=newCasesBySpecimenDate&"
-"metric=hospitalCases&"
-"metric=newDeaths28DaysByPublishDate&"
-"format=csv"
-)
-
-!curl "$uk_query_string" > uk_data.csv
-```
-
-```python Collapsed="false"
-test_query_string = (
-"https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
-"metric=newPCRTestsByPublishDate&"
-"metric=newTestsByPublishDate&"
-"metric=newPillarOneTwoTestsByPublishDate&"
-"format=csv"
-)
-!curl "$test_query_string" > test_data.csv
-```
-
-```python Collapsed="false"
-# hospital_query_string = (
-# "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&"
-# "metric=newAdmissions&"
-# "format=csv"
-# )
-# !curl "$hospital_query_string" > hospital_admissions.csv
-```
-
-```python Collapsed="false"
-# hospital_query_string = (
-# "https://api.coronavirus.data.gov.uk/v1/data?"
-# "filters=areaName=United%2520Kingdom;areaType=overview&"
-# "structure=%7B%22date%22:%22date%22,%22areaName%22:%22areaName%22,%22areaType%22:%22areaType%22,"
-# "%22newAdmissions%22:%22newAdmissions%22,%22cumAdmissions%22:%22cumAdmissions%22%7D&format=csv"
-# )
-    
-# !curl "$hospital_query_string" | gunzip > hospital_admissions.csv
-```
-
-```python Collapsed="false"
-uk_data = pd.read_csv('uk_data.csv', 
-                       parse_dates=[0], dayfirst=True)
-uk_data
-```
-
-```python Collapsed="false"
-test_data = pd.read_csv('test_data.csv', 
-                       parse_dates=[0], dayfirst=True)
-test_data
-```
-
-```python Collapsed="false"
-test_data.columns
-```
-
-```python Collapsed="false"
-uk_data = uk_data.merge(test_data[['date', 'newPCRTestsByPublishDate',
-       'newTestsByPublishDate', 'newPillarOneTwoTestsByPublishDate']], how='outer', on='date')
-```
-
-```python Collapsed="false"
-uk_data.columns
-```
-
-```python Collapsed="false"
-uk_data.rename(
-    columns={
-        'covidOccupiedMVBeds': 'ventilator_beds',
-        'newCasesBySpecimenDate': 'new_cases',
-        'hospitalCases': 'hospital_cases', 
-        'newDeaths28DaysByPublishDate': 'new_deaths',
-        'newAdmissions': 'new_admissions',
-        'newPCRTestsByPublishDate': 'new_pcr_tests',
-        'newTestsByPublishDate': 'new_tests',
-        'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests'
-    }, inplace=True)
-```
-
-```python Collapsed="false"
-uk_data.dtypes
-```
-
-```python Collapsed="false"
- uk_data.columns
-```
-
-```python Collapsed="false"
-uk_data[['date', 
-         'hospital_cases', 'ventilator_beds',
-         'new_cases', 'new_deaths', 
-         'hospital_cases', 'new_admissions',
-         'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests'
-        ]].to_sql(
-    'uk_data',
-    engine,
-    if_exists='replace',
-    index=False,
-    chunksize=500,
-    dtype={
-        "date": Date,
-        "hospital_cases": Integer,
-        "ventilator_beds": Integer,
-        "new_cases": Integer,
-        "hospital_cases": Integer,
-        "new_deaths": Integer,
-        "new_admissions": Integer,
-        'new_pcr_tests': Integer, 
-        'new_tests': Integer, 
-        'new_pillar_1_2_tests': Integer
-    }
-)
-```
-
-```python Collapsed="false"
-# %sql select * from uk_data order by date desc limit 10
-```
-
-```python Collapsed="false"
-query_string = '''drop table if exists uk_data_7;
-create table uk_data_7 
-(date date primary key,
- hospital_cases real,
- ventilator_beds real,
- new_cases real,
- new_deaths real,
- new_admissions real,
- new_pcr_tests real,
- new_tests real,
- new_pillar_1_2_tests real
-);'''
-
-with engine.connect() as connection:
-    connection.execute(query_string)
-```
-
-```python
-update_string = '''with ownd as (
-    select date,
-        avg(hospital_cases) over wnd as w_hospital_cases,
-        avg(ventilator_beds) over wnd as w_ventilator_beds,
-        avg(new_cases) over wnd as w_new_cases,
-        avg(new_deaths) over wnd as w_new_deaths,
-        avg(new_admissions) over wnd as w_new_admissions,
-        avg(new_pcr_tests) over wnd as w_new_pcr_tests,
-        avg(new_tests) over wnd as w_new_tests,
-        avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
-        count(*) over wnd as w_size
-    from uk_data
-    window wnd as (
-        order by uk_data.date
-        rows between 3 preceding and 3 following
-    )
-)
-insert into uk_data_7(date, 
-    hospital_cases, 
-    ventilator_beds, 
-    new_cases,
-    new_deaths,
-    new_admissions, 
-    new_pcr_tests, 
-    new_tests, 
-    new_pillar_1_2_tests
-    )
-(select date,
-        w_hospital_cases,
-        w_ventilator_beds,
-        w_new_cases,
-        w_new_deaths,
-        w_new_admissions,
-        w_new_pcr_tests,
-        w_new_tests,
-        w_new_pillar_1_2_tests
-    from ownd
-    where w_size = 7
-)'''
-with engine.connect() as connection:
-    connection.execute(update_string)
-```
-
-```python Collapsed="false"
-# %%sql insert into uk_data_7(date, ventilator_beds, new_cases, hospital_cases, new_deaths, new_admissions)
-# values (
-#     select date, 
-#     avg(ventilator_beds) over (order by date rows between 6 preceding and current row)
-#     from uk_data
-# )
-```
-
-```python Collapsed="false"
-# query_string = '''insert into uk_data_7(date, hospital_cases)
-#     select uk_data.date, 
-#         avg(uk_data.hospital_cases) over (order by uk_data.date rows between 3 preceding and 3 following) as hospital_cases
-#     from uk_data'''
-# with engine.connect() as connection:
-#     connection.execute(query_string)   
-```
-
-```python Collapsed="false"
-# %%sql 
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.ventilator_beds) over (order by uk_data.date rows between 6 preceding and current row) as nc7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set ventilator_beds = nc7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.ventilator_beds) over (order by uk_data.date rows between 3 preceding and 3 following) as nc7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set ventilator_beds = nc7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.new_cases) over (order by uk_data.date rows between 3 preceding and 3 following) as nc7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set new_cases = nc7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.hospital_cases) over (order by uk_data.date rows between 3 preceding and 3 following) as d7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set hospital_cases = d7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.new_deaths) over (order by uk_data.date rows between 3 preceding and 3 following) as d7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set new_deaths = d7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.new_admissions) over (order by uk_data.date rows between 3 preceding and 3 following) as d7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set new_admissions = d7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.new_pcr_tests) over (order by uk_data.date rows between 3 preceding and 3 following) as d7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set new_pcr_tests = d7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.new_tests) over (order by uk_data.date rows between 3 preceding and 3 following) as d7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set new_tests = d7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with m7 as 
-#     (select uk_data.date as date7, 
-#         avg(uk_data.new_pillar_1_2_tests) over (order by uk_data.date rows between 3 preceding and 3 following) as d7
-#      from uk_data
-#     )
-# update uk_data_7
-#     set new_pillar_1_2_tests = d7
-#     from m7
-#     where uk_data_7.date = m7.date7
-```
-
-```python Collapsed="false"
-# %%sql
-# with wnd as
-#     (   select date, 
-#              avg(new_pillar_1_2_tests)
-#                 over (order by uk_data.date rows between 3 preceding and 3 following) as a_new_pillar_1_2_tests
-#         from uk_data
-#     )
-# update uk_data_7
-#     set new_pillar_1_2_tests = wnd.a_new_pillar_1_2_tests
-#     from wnd
-#     where uk_data_7.date = wnd.date
-#     and (select count(*) from wnd) = 7
-```
-
-```python Collapsed="false"
-# %%sql
-# with wnd as
-#     (   select date, 
-#              avg(new_pillar_1_2_tests)
-#                 over (order by uk_data.date rows between 3 preceding and 3 following) as a_new_pillar_1_2_tests,
-#              count(new_pillar_1_2_tests)
-#                 over (order by uk_data.date rows between 3 preceding and 3 following) as a_count
-#         from uk_data
-#     )
-# select uk_data_7.date, new_pillar_1_2_tests, wnd.a_new_pillar_1_2_tests, wnd.a_count
-#     from uk_data_7, wnd
-#     where uk_data_7.date = wnd.date
-#     order by uk_data_7.date desc limit 10
-    
-# select date, 
-#     count(*) over wnd as w_size
-# from uk_data
-# window wnd as (
-#     order by uk_data.date
-#     rows between 3 preceding and 3 following
-# );
-```
-
-```python Collapsed="false"
-# %%sql
-# select date, 
-#     count(*) over wnd as w_size
-# from uk_data
-# window wnd as (
-#     order by uk_data.date
-#     rows between 3 preceding and 3 following
-# )
-# order by date desc limit 10
-```
-
-```python
-# %%sql
-# with ownd as (
-#     select date,
-#         avg(hospital_cases) over wnd as w_hospital_cases,
-#         avg(ventilator_beds) over wnd as w_ventilator_beds,
-#         avg(new_cases) over wnd as w_new_cases,
-#         avg(new_deaths) over wnd as w_new_deaths,
-#         avg(new_admissions) over wnd as w_new_admissions,
-#         avg(new_pcr_tests) over wnd as w_new_pcr_tests,
-#         avg(new_tests) over wnd as w_new_tests,
-#         avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
-#         count(*) over wnd as w_size
-#     from uk_data
-#     window wnd as (
-#         order by uk_data.date
-#         rows between 3 preceding and 3 following
-#     ))
-# insert into uk_data_7(date, 
-#     hospital_cases, 
-#     ventilator_beds, 
-#     new_cases,
-#     new_deaths,
-#     new_admissions, 
-#     new_pcr_tests, 
-#     new_tests, 
-#     new_pillar_1_2_tests
-#     )
-# (select date,
-#         avg(hospital_cases) over wnd as w_hospital_cases,
-#         avg(ventilator_beds) over wnd as w_ventilator_beds,
-#         avg(new_cases) over wnd as w_new_cases,
-#         avg(new_deaths) over wnd as w_new_deaths,
-#         avg(new_admissions) over wnd as w_new_admissions,
-#         avg(new_pcr_tests) over wnd as w_new_pcr_tests,
-#         avg(new_tests) over wnd as w_new_tests,
-#         avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
-#         count(*) over wnd as w_size
-#     from uk_data
-#     window wnd as (
-#         order by uk_data.date
-#         rows between 3 preceding and 3 following
-#     )
-# )
-#     set date = ownd.date,
-#         hospital_cases = w_hospital_cases,
-#         ventilator_beds = w_ventilator_beds,
-#         new_cases = w_new_cases,
-#         new_deaths = w_new_deaths,
-#         new_admissions = w_new_admissions,
-#         new_pcr_tests = w_new_pcr_tests,
-#         new_tests = w_new_tests,
-#         new_pillar_1_2_tests = w_new_pillar_1_2_tests
-#     from ownd
-#     where w_size = 7
-```
-
-```python
-# %%sql
-# with ownd as (
-#     select date,
-#         avg(hospital_cases) over wnd as w_hospital_cases,
-#         avg(ventilator_beds) over wnd as w_ventilator_beds,
-#         avg(new_cases) over wnd as w_new_cases,
-#         avg(new_deaths) over wnd as w_new_deaths,
-#         avg(new_admissions) over wnd as w_new_admissions,
-#         avg(new_pcr_tests) over wnd as w_new_pcr_tests,
-#         avg(new_tests) over wnd as w_new_tests,
-#         avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests,
-#         count(*) over wnd as w_size
-#     from uk_data
-#     window wnd as (
-#         order by uk_data.date
-#         rows between 3 preceding and 3 following
-#     )
-# )
-# insert into uk_data_7(date, 
-#     hospital_cases, 
-#     ventilator_beds, 
-#     new_cases,
-#     new_deaths,
-#     new_admissions, 
-#     new_pcr_tests, 
-#     new_tests, 
-#     new_pillar_1_2_tests
-#     )
-# (select date,
-#         w_hospital_cases,
-#         w_ventilator_beds,
-#         w_new_cases,
-#         w_new_deaths,
-#         w_new_admissions,
-#         w_new_pcr_tests,
-#         w_new_tests,
-#         w_new_pillar_1_2_tests
-#     from ownd
-#     where w_size = 7
-# )
-```
-
-```python Collapsed="false"
-%sql select * from uk_data_7 order by date desc limit 10
-```
-
-```python Collapsed="false"
-%sql select * from uk_data order by date desc limit 10
-```
-
-```python Collapsed="false"
-
-```