X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;ds=sidebyside;f=data_import.md;fp=data_import.md;h=0000000000000000000000000000000000000000;hb=4feace1e2a5076f3efa0fd65b31411ac511efbb1;hp=0a680eb9577b811165f896f1160b5d54fd915e88;hpb=891ac3fd6c3ef5cc37a66f0b53d870e218d74d28;p=covid19.git diff --git a/data_import.md b/data_import.md deleted file mode 100644 index 0a680eb..0000000 --- a/data_import.md +++ /dev/null @@ -1,742 +0,0 @@ ---- -jupyter: - jupytext: - formats: ipynb,md - text_representation: - extension: .md - format_name: markdown - format_version: '1.2' - jupytext_version: 1.9.1 - kernelspec: - display_name: Python 3 - language: python - name: python3 ---- - - -Data from [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide) - - -```python Collapsed="false" -from sqlalchemy.types import Integer, Text, String, DateTime, Date, Float -from sqlalchemy import create_engine -``` - -```python Collapsed="false" -import itertools -import collections -import json -import pandas as pd -import numpy as np -from scipy.stats import gmean -import datetime - -import matplotlib as mpl -import matplotlib.pyplot as plt -%matplotlib inline -``` - -```python Collapsed="false" -%load_ext sql -``` - -```python Collapsed="false" -connection_string = 'postgresql://covid:3NbjJTkT63@localhost/covid' -``` - -```python Collapsed="false" -%sql $connection_string -``` - -```python Collapsed="false" -eng = create_engine(connection_string) -engine = eng.execution_options(isolation_level="AUTOCOMMIT") -``` - -```python Collapsed="false" -DEATH_COUNT_THRESHOLD = 10 -COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split() -COUNTRIES_NORDIC = 'SE NO DK FI UK'.split() -COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split() -# COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split() - -COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA - 'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT', - 'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN', - 'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE'] -COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS)) -COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS)) -``` - -```python Collapsed="false" -!curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv -``` - -```python Collapsed="false" -# First col is a date, treat geoId of NA as 'Namibia', not "NA" value -raw_data = pd.read_csv('covid.csv', - parse_dates=[0], dayfirst=True, - keep_default_na=False, na_values = [''], -# dtype = {'day': np.int64, -# 'month': np.int64, -# 'year': np.int64, -# 'cases': np.int64, -# 'deaths': np.int64, -# 'countriesAndTerritories': str, -# 'geoId': str, -# 'countryterritoryCode': str, -# 'popData2019': np.int64, -# 'continentExp': str, -# } - ) -``` - -```python Collapsed="false" -raw_data.size -``` - -```python Collapsed="false" -raw_data.fillna(0, inplace=True) -``` - -```python Collapsed="false" -raw_data.head() -``` - -```python Collapsed="false" -raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id', - 'countriesAndTerritories': 'country_name', - 'countryterritoryCode': 'country_territory_code', - 'popData2019': 'population_2019', - 'continentExp': 'continent'}, inplace=True) -``` - -```python Collapsed="false" -raw_data.columns -``` - -```python Collapsed="false" -raw_data.dtypes -``` - -```python Collapsed="false" -raw_data[['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id', 'notification_rate_per_100000_population_14-days']] -``` - -```python Collapsed="false" -raw_data[['country_name', 'geo_id', 'country_territory_code', - 'population_2019', 'continent']] -``` - -```python Collapsed="false" -raw_data[['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id', 'notification_rate_per_100000_population_14-days']].to_sql( - 'weekly_cases', - engine, - if_exists='replace', - index=False, - chunksize=500, - dtype={ - "report_date": Date, - "cases_weekly": Integer, - "deaths_weekly": Integer, - "geo_id": String, - "notification_rate_per_100000_population_14-days": Float - } -) -``` - -```python Collapsed="false" -raw_data[['country_name', 'geo_id', 'country_territory_code', - 'population_2019', 'continent']].drop_duplicates().to_sql( - 'countries', - engine, - if_exists='replace', - index=False, - chunksize=500, - dtype={ - "country_name": Text, - "geo_id": String, - "country_territory_code": String, - "population_2019": Integer, - "continent": Text - } -) -``` - -```python Collapsed="false" -# %sql select geo_id from weekly_cases limit 10 -``` - -```python Collapsed="false" -# %%sql alter table weekly_cases add primary key (geo_id, report_date); -# alter table countries add primary key (geo_id); -# alter table weekly_cases add foreign key (geo_id) references countries(geo_id); -# alter table weekly_cases add culm_cases integer; -# alter table weekly_cases add culm_deaths integer; -``` - -```python -with engine.connect() as connection: - connection.execute('alter table weekly_cases add primary key (geo_id, report_date)') - connection.execute('alter table countries add primary key (geo_id);') - connection.execute('alter table weekly_cases add foreign key (geo_id) references countries(geo_id);') - connection.execute('alter table weekly_cases add culm_cases integer;') - connection.execute('alter table weekly_cases add culm_deaths integer;') -``` - -```python Collapsed="false" -# %sql select report_date, cases_weekly, country_name from weekly_cases join countries using (geo_id) order by report_date desc limit 10 -``` - -```python Collapsed="false" -# %sql select report_date, cases_weekly, country_name from weekly_cases join countries on weekly_cases.geo_id = countries.geo_id order by report_date desc limit 10 -``` - -```python Collapsed="false" -%sql select * from weekly_cases limit 10 -``` - -```python Collapsed="false" -# %%sql -# with culm as -# (select report_date, geo_id, -# sum(cases_weekly) over (partition by geo_id -# order by report_date) as culm_data -# from weekly_cases -# ) -# update weekly_cases -# set culm_cases = culm_data -# from culm -# where weekly_cases.report_date = culm.report_date and -# weekly_cases.geo_id = culm.geo_id -``` - -```python Collapsed="false" -query_string = '''with culm as - (select report_date, geo_id, - sum(cases_weekly) over (partition by geo_id - order by report_date) as culm_data - from weekly_cases - ) -update weekly_cases - set culm_cases = culm_data - from culm - where weekly_cases.report_date = culm.report_date and - weekly_cases.geo_id = culm.geo_id''' -with engine.connect() as connection: - connection.execute(query_string) -``` - -```python Collapsed="false" -# %%sql -# with culm as -# (select report_date, geo_id, -# sum(deaths_weekly) over (partition by geo_id -# order by report_date) as culm_data -# from weekly_cases -# ) -# update weekly_cases -# set culm_deaths = culm_data -# from culm -# where weekly_cases.report_date = culm.report_date and -# weekly_cases.geo_id = culm.geo_id -``` - -```python Collapsed="false" -query_string = '''with culm as - (select report_date, geo_id, - sum(deaths_weekly) over (partition by geo_id - order by report_date) as culm_data - from weekly_cases - ) -update weekly_cases - set culm_deaths = culm_data - from culm - where weekly_cases.report_date = culm.report_date and - weekly_cases.geo_id = culm.geo_id''' -with engine.connect() as connection: - connection.execute(query_string) -``` - -```python Collapsed="false" -uk_query_string = ( -"https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&" -"metric=covidOccupiedMVBeds&" -"metric=newAdmissions&" -"metric=newCasesBySpecimenDate&" -"metric=hospitalCases&" -"metric=newDeaths28DaysByPublishDate&" -"format=csv" -) - -!curl "$uk_query_string" > uk_data.csv -``` - -```python Collapsed="false" -test_query_string = ( -"https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&" -"metric=newPCRTestsByPublishDate&" -"metric=newTestsByPublishDate&" -"metric=newPillarOneTwoTestsByPublishDate&" -"format=csv" -) -!curl "$test_query_string" > test_data.csv -``` - -```python Collapsed="false" -# hospital_query_string = ( -# "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&" -# "metric=newAdmissions&" -# "format=csv" -# ) -# !curl "$hospital_query_string" > hospital_admissions.csv -``` - -```python Collapsed="false" -# hospital_query_string = ( -# "https://api.coronavirus.data.gov.uk/v1/data?" -# "filters=areaName=United%2520Kingdom;areaType=overview&" -# "structure=%7B%22date%22:%22date%22,%22areaName%22:%22areaName%22,%22areaType%22:%22areaType%22," -# "%22newAdmissions%22:%22newAdmissions%22,%22cumAdmissions%22:%22cumAdmissions%22%7D&format=csv" -# ) - -# !curl "$hospital_query_string" | gunzip > hospital_admissions.csv -``` - -```python Collapsed="false" -uk_data = pd.read_csv('uk_data.csv', - parse_dates=[0], dayfirst=True) -uk_data -``` - -```python Collapsed="false" -test_data = pd.read_csv('test_data.csv', - parse_dates=[0], dayfirst=True) -test_data -``` - -```python Collapsed="false" -test_data.columns -``` - -```python Collapsed="false" -uk_data = uk_data.merge(test_data[['date', 'newPCRTestsByPublishDate', - 'newTestsByPublishDate', 'newPillarOneTwoTestsByPublishDate']], how='outer', on='date') -``` - -```python Collapsed="false" -uk_data.columns -``` - -```python Collapsed="false" -uk_data.rename( - columns={ - 'covidOccupiedMVBeds': 'ventilator_beds', - 'newCasesBySpecimenDate': 'new_cases', - 'hospitalCases': 'hospital_cases', - 'newDeaths28DaysByPublishDate': 'new_deaths', - 'newAdmissions': 'new_admissions', - 'newPCRTestsByPublishDate': 'new_pcr_tests', - 'newTestsByPublishDate': 'new_tests', - 'newPillarOneTwoTestsByPublishDate': 'new_pillar_1_2_tests' - }, inplace=True) -``` - -```python Collapsed="false" -uk_data.dtypes -``` - -```python Collapsed="false" - uk_data.columns -``` - -```python Collapsed="false" -uk_data[['date', - 'hospital_cases', 'ventilator_beds', - 'new_cases', 'new_deaths', - 'hospital_cases', 'new_admissions', - 'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests' - ]].to_sql( - 'uk_data', - engine, - if_exists='replace', - index=False, - chunksize=500, - dtype={ - "date": Date, - "hospital_cases": Integer, - "ventilator_beds": Integer, - "new_cases": Integer, - "hospital_cases": Integer, - "new_deaths": Integer, - "new_admissions": Integer, - 'new_pcr_tests': Integer, - 'new_tests': Integer, - 'new_pillar_1_2_tests': Integer - } -) -``` - -```python Collapsed="false" -# %sql select * from uk_data order by date desc limit 10 -``` - -```python Collapsed="false" -query_string = '''drop table if exists uk_data_7; -create table uk_data_7 -(date date primary key, - hospital_cases real, - ventilator_beds real, - new_cases real, - new_deaths real, - new_admissions real, - new_pcr_tests real, - new_tests real, - new_pillar_1_2_tests real -);''' - -with engine.connect() as connection: - connection.execute(query_string) -``` - -```python -update_string = '''with ownd as ( - select date, - avg(hospital_cases) over wnd as w_hospital_cases, - avg(ventilator_beds) over wnd as w_ventilator_beds, - avg(new_cases) over wnd as w_new_cases, - avg(new_deaths) over wnd as w_new_deaths, - avg(new_admissions) over wnd as w_new_admissions, - avg(new_pcr_tests) over wnd as w_new_pcr_tests, - avg(new_tests) over wnd as w_new_tests, - avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests, - count(*) over wnd as w_size - from uk_data - window wnd as ( - order by uk_data.date - rows between 3 preceding and 3 following - ) -) -insert into uk_data_7(date, - hospital_cases, - ventilator_beds, - new_cases, - new_deaths, - new_admissions, - new_pcr_tests, - new_tests, - new_pillar_1_2_tests - ) -(select date, - w_hospital_cases, - w_ventilator_beds, - w_new_cases, - w_new_deaths, - w_new_admissions, - w_new_pcr_tests, - w_new_tests, - w_new_pillar_1_2_tests - from ownd - where w_size = 7 -)''' -with engine.connect() as connection: - connection.execute(update_string) -``` - -```python Collapsed="false" -# %%sql insert into uk_data_7(date, ventilator_beds, new_cases, hospital_cases, new_deaths, new_admissions) -# values ( -# select date, -# avg(ventilator_beds) over (order by date rows between 6 preceding and current row) -# from uk_data -# ) -``` - -```python Collapsed="false" -# query_string = '''insert into uk_data_7(date, hospital_cases) -# select uk_data.date, -# avg(uk_data.hospital_cases) over (order by uk_data.date rows between 3 preceding and 3 following) as hospital_cases -# from uk_data''' -# with engine.connect() as connection: -# connection.execute(query_string) -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.ventilator_beds) over (order by uk_data.date rows between 6 preceding and current row) as nc7 -# from uk_data -# ) -# update uk_data_7 -# set ventilator_beds = nc7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.ventilator_beds) over (order by uk_data.date rows between 3 preceding and 3 following) as nc7 -# from uk_data -# ) -# update uk_data_7 -# set ventilator_beds = nc7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.new_cases) over (order by uk_data.date rows between 3 preceding and 3 following) as nc7 -# from uk_data -# ) -# update uk_data_7 -# set new_cases = nc7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.hospital_cases) over (order by uk_data.date rows between 3 preceding and 3 following) as d7 -# from uk_data -# ) -# update uk_data_7 -# set hospital_cases = d7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.new_deaths) over (order by uk_data.date rows between 3 preceding and 3 following) as d7 -# from uk_data -# ) -# update uk_data_7 -# set new_deaths = d7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.new_admissions) over (order by uk_data.date rows between 3 preceding and 3 following) as d7 -# from uk_data -# ) -# update uk_data_7 -# set new_admissions = d7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.new_pcr_tests) over (order by uk_data.date rows between 3 preceding and 3 following) as d7 -# from uk_data -# ) -# update uk_data_7 -# set new_pcr_tests = d7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.new_tests) over (order by uk_data.date rows between 3 preceding and 3 following) as d7 -# from uk_data -# ) -# update uk_data_7 -# set new_tests = d7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with m7 as -# (select uk_data.date as date7, -# avg(uk_data.new_pillar_1_2_tests) over (order by uk_data.date rows between 3 preceding and 3 following) as d7 -# from uk_data -# ) -# update uk_data_7 -# set new_pillar_1_2_tests = d7 -# from m7 -# where uk_data_7.date = m7.date7 -``` - -```python Collapsed="false" -# %%sql -# with wnd as -# ( select date, -# avg(new_pillar_1_2_tests) -# over (order by uk_data.date rows between 3 preceding and 3 following) as a_new_pillar_1_2_tests -# from uk_data -# ) -# update uk_data_7 -# set new_pillar_1_2_tests = wnd.a_new_pillar_1_2_tests -# from wnd -# where uk_data_7.date = wnd.date -# and (select count(*) from wnd) = 7 -``` - -```python Collapsed="false" -# %%sql -# with wnd as -# ( select date, -# avg(new_pillar_1_2_tests) -# over (order by uk_data.date rows between 3 preceding and 3 following) as a_new_pillar_1_2_tests, -# count(new_pillar_1_2_tests) -# over (order by uk_data.date rows between 3 preceding and 3 following) as a_count -# from uk_data -# ) -# select uk_data_7.date, new_pillar_1_2_tests, wnd.a_new_pillar_1_2_tests, wnd.a_count -# from uk_data_7, wnd -# where uk_data_7.date = wnd.date -# order by uk_data_7.date desc limit 10 - -# select date, -# count(*) over wnd as w_size -# from uk_data -# window wnd as ( -# order by uk_data.date -# rows between 3 preceding and 3 following -# ); -``` - -```python Collapsed="false" -# %%sql -# select date, -# count(*) over wnd as w_size -# from uk_data -# window wnd as ( -# order by uk_data.date -# rows between 3 preceding and 3 following -# ) -# order by date desc limit 10 -``` - -```python -# %%sql -# with ownd as ( -# select date, -# avg(hospital_cases) over wnd as w_hospital_cases, -# avg(ventilator_beds) over wnd as w_ventilator_beds, -# avg(new_cases) over wnd as w_new_cases, -# avg(new_deaths) over wnd as w_new_deaths, -# avg(new_admissions) over wnd as w_new_admissions, -# avg(new_pcr_tests) over wnd as w_new_pcr_tests, -# avg(new_tests) over wnd as w_new_tests, -# avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests, -# count(*) over wnd as w_size -# from uk_data -# window wnd as ( -# order by uk_data.date -# rows between 3 preceding and 3 following -# )) -# insert into uk_data_7(date, -# hospital_cases, -# ventilator_beds, -# new_cases, -# new_deaths, -# new_admissions, -# new_pcr_tests, -# new_tests, -# new_pillar_1_2_tests -# ) -# (select date, -# avg(hospital_cases) over wnd as w_hospital_cases, -# avg(ventilator_beds) over wnd as w_ventilator_beds, -# avg(new_cases) over wnd as w_new_cases, -# avg(new_deaths) over wnd as w_new_deaths, -# avg(new_admissions) over wnd as w_new_admissions, -# avg(new_pcr_tests) over wnd as w_new_pcr_tests, -# avg(new_tests) over wnd as w_new_tests, -# avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests, -# count(*) over wnd as w_size -# from uk_data -# window wnd as ( -# order by uk_data.date -# rows between 3 preceding and 3 following -# ) -# ) -# set date = ownd.date, -# hospital_cases = w_hospital_cases, -# ventilator_beds = w_ventilator_beds, -# new_cases = w_new_cases, -# new_deaths = w_new_deaths, -# new_admissions = w_new_admissions, -# new_pcr_tests = w_new_pcr_tests, -# new_tests = w_new_tests, -# new_pillar_1_2_tests = w_new_pillar_1_2_tests -# from ownd -# where w_size = 7 -``` - -```python -# %%sql -# with ownd as ( -# select date, -# avg(hospital_cases) over wnd as w_hospital_cases, -# avg(ventilator_beds) over wnd as w_ventilator_beds, -# avg(new_cases) over wnd as w_new_cases, -# avg(new_deaths) over wnd as w_new_deaths, -# avg(new_admissions) over wnd as w_new_admissions, -# avg(new_pcr_tests) over wnd as w_new_pcr_tests, -# avg(new_tests) over wnd as w_new_tests, -# avg(new_pillar_1_2_tests) over wnd as w_new_pillar_1_2_tests, -# count(*) over wnd as w_size -# from uk_data -# window wnd as ( -# order by uk_data.date -# rows between 3 preceding and 3 following -# ) -# ) -# insert into uk_data_7(date, -# hospital_cases, -# ventilator_beds, -# new_cases, -# new_deaths, -# new_admissions, -# new_pcr_tests, -# new_tests, -# new_pillar_1_2_tests -# ) -# (select date, -# w_hospital_cases, -# w_ventilator_beds, -# w_new_cases, -# w_new_deaths, -# w_new_admissions, -# w_new_pcr_tests, -# w_new_tests, -# w_new_pillar_1_2_tests -# from ownd -# where w_size = 7 -# ) -``` - -```python Collapsed="false" -%sql select * from uk_data_7 order by date desc limit 10 -``` - -```python Collapsed="false" -%sql select * from uk_data order by date desc limit 10 -``` - -```python Collapsed="false" - -```