X-Git-Url: https://git.njae.me.uk/?p=covid19.git;a=blobdiff_plain;f=data_import.py;h=833a6eb443a5c1bd41d6c009c339848350bf0f75;hp=20593c7219c8ebb614626b4f3a297bd8d38323f8;hb=HEAD;hpb=5afedd66506be7575034ae6deebcfaa7c2ced978 diff --git a/data_import.py b/data_import.py index 20593c7..833a6eb 100644 --- a/data_import.py +++ b/data_import.py @@ -14,6 +14,11 @@ import numpy as np # from scipy.stats import gmean import datetime import os +# import matplotlib as mpl +# import matplotlib.pyplot as plt +# import matplotlib.animation as ani + +# # %matplotlib inline # %% @@ -24,28 +29,32 @@ engine = eng.execution_options(isolation_level="AUTOCOMMIT") # %% -DEATH_COUNT_THRESHOLD = 10 -COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split() -COUNTRIES_NORDIC = 'SE NO DK FI UK'.split() -COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split() -# COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split() +# DEATH_COUNT_THRESHOLD = 10 +# COUNTRIES_CORE = 'IT DE UK ES IE FR BE'.split() +# COUNTRIES_NORDIC = 'SE NO DK FI UK'.split() +# COUNTRIES_FRIENDS = 'IT UK ES BE SI MX'.split() +# # COUNTRIES_FRIENDS = 'IT UK ES BE SI PT'.split() -COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA - 'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT', - 'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN', - 'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE'] -COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS)) -COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS)) +# COUNTRIES_AMERICAS = ['AG', 'AR', 'AW', 'BS', 'BB', 'BZ', 'BM', 'BO', 'BR', 'VG', 'KY', # excluding Canada and USA +# 'CL', 'CO', 'CR', 'CU', 'CW', 'DM', 'DO', 'EC', 'SV', 'GL', 'GD', 'GT', +# 'GY', 'HT', 'HN', 'JM', 'MX', 'MS', 'NI', 'PA', 'PY', 'PE', 'PR', 'KN', +# 'LC', 'VC', 'SX', 'SR', 'TT', 'TC', 'VI', 'UY', 'VE'] +# COUNTRIES_OF_INTEREST = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS)) +# COUNTRIES_ALL = list(set(COUNTRIES_CORE + COUNTRIES_FRIENDS + COUNTRIES_NORDIC + COUNTRIES_AMERICAS)) # %% -os.system('curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv') +## Old data source +# os.system('curl https://opendata.ecdc.europa.eu/covid19/casedistribution/csv/ > covid.csv') + +## In outer shell script, as it doesn't seem to work here +os.system('curl https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/csv/ > covid.csv') # %% # First col is a date, treat geoId of NA as 'Namibia', not "NA" value raw_data = pd.read_csv('covid.csv', - parse_dates=[0], dayfirst=True, + parse_dates=[7], dayfirst=True, keep_default_na=False, na_values = [''], # dtype = {'day': np.int64, # 'month': np.int64, @@ -61,18 +70,67 @@ raw_data = pd.read_csv('covid.csv', ) raw_data.fillna(0, inplace=True) +raw_data = raw_data[raw_data.country_code != 0] +# raw_data + + +# %% +raw_data['date'] = raw_data.year_week.apply(lambda yw: datetime.datetime.fromisocalendar(int(yw[:4]), int(yw[5:]), 7 )) +# raw_data + +# %% +# raw_data.dtypes + +# %% +# raw_data.head() + +# %% +# raw_data[raw_data.country_code == 'GBR'].sort_values(by='date').tail(20) + +# %% +case_data = raw_data[raw_data.indicator == 'cases'][['country_code', 'date', 'weekly_count', 'cumulative_count']] +case_data.rename(columns={'weekly_count': 'cases_weekly', 'cumulative_count': 'culm_cases'}, inplace=True) +# case_data.head() + +# %% +deaths_data = raw_data[raw_data.indicator == 'deaths'][['country_code', 'date', 'weekly_count', 'cumulative_count']] +deaths_data.rename(columns={'weekly_count': 'deaths_weekly', 'cumulative_count': 'culm_deaths'}, inplace=True) +# deaths_data.head() + +# %% +case_death_data = case_data.merge(deaths_data) +# case_death_data + +# %% +# raw_data.columns + +# %% +country_data = raw_data[['country', 'country_code', 'continent', 'population']].drop_duplicates() +# country_data +# %% # %% -raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id', - 'countriesAndTerritories': 'country_name', - 'countryterritoryCode': 'country_territory_code', - 'popData2019': 'population_2019', - 'continentExp': 'continent'}, inplace=True) +# raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id', +# 'countriesAndTerritories': 'country_name', +# 'countryterritoryCode': 'country_territory_code', +# 'popData2019': 'population_2019', +# 'continentExp': 'continent'}, inplace=True) + +# raw_data.rename(columns={'dateRep':'report_date', 'geoId': 'geo_id', +# 'countriesAndTerritories': 'country_name', +# 'countryterritoryCode': 'country_territory_code', +# 'cases': 'cases_weekly', +# 'deaths': 'deaths_weekly', +# 'popData2019': 'population_2019', +# 'continentExp': 'continent', +# # 'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000': 'notification_rate_per_100000_population_14-days' +# }, +# inplace=True) # %% -raw_data[['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id', 'notification_rate_per_100000_population_14-days']].to_sql( +case_death_data.to_sql( 'weekly_cases', engine, if_exists='replace', @@ -81,16 +139,16 @@ raw_data[['report_date', 'cases_weekly', 'deaths_weekly', 'geo_id', 'notificatio dtype={ "report_date": Date, "cases_weekly": Integer, + "cases_culm": Integer, "deaths_weekly": Integer, - "geo_id": String, - "notification_rate_per_100000_population_14-days": Float + "deaths_culm": Integer, + "country_code": String } ) # %% -raw_data[['country_name', 'geo_id', 'country_territory_code', - 'population_2019', 'continent']].drop_duplicates().to_sql( +country_data.to_sql( 'countries', engine, if_exists='replace', @@ -98,55 +156,13 @@ raw_data[['country_name', 'geo_id', 'country_territory_code', chunksize=500, dtype={ "country_name": Text, - "geo_id": String, - "country_territory_code": String, - "population_2019": Integer, + "country_code": String, + "population": Integer, "continent": Text } ) -# %% -with engine.connect() as connection: - connection.execute('alter table weekly_cases add primary key (geo_id, report_date)') - connection.execute('alter table countries add primary key (geo_id);') - connection.execute('alter table weekly_cases add foreign key (geo_id) references countries(geo_id);') - connection.execute('alter table weekly_cases add culm_cases integer;') - connection.execute('alter table weekly_cases add culm_deaths integer;') - - -# %% -query_string = '''with culm as - (select report_date, geo_id, - sum(cases_weekly) over (partition by geo_id - order by report_date) as culm_data - from weekly_cases - ) -update weekly_cases - set culm_cases = culm_data - from culm - where weekly_cases.report_date = culm.report_date and - weekly_cases.geo_id = culm.geo_id''' -with engine.connect() as connection: - connection.execute(query_string) - - -# %% -query_string = '''with culm as - (select report_date, geo_id, - sum(deaths_weekly) over (partition by geo_id - order by report_date) as culm_data - from weekly_cases - ) -update weekly_cases - set culm_deaths = culm_data - from culm - where weekly_cases.report_date = culm.report_date and - weekly_cases.geo_id = culm.geo_id''' -with engine.connect() as connection: - connection.execute(query_string) - - # %% uk_query_string = ( "https://api.coronavirus.data.gov.uk/v2/data?areaType=overview&" @@ -155,6 +171,7 @@ uk_query_string = ( "metric=newCasesBySpecimenDate&" "metric=hospitalCases&" "metric=newDeaths28DaysByPublishDate&" +"metric=cumDeaths28DaysByPublishDate&" "format=csv" ) @@ -194,6 +211,7 @@ uk_data.rename( 'newCasesBySpecimenDate': 'new_cases', 'hospitalCases': 'hospital_cases', 'newDeaths28DaysByPublishDate': 'new_deaths', + 'cumDeaths28DaysByPublishDate': 'cum_deaths', 'newAdmissions': 'new_admissions', 'newPCRTestsByPublishDate': 'new_pcr_tests', 'newTestsByPublishDate': 'new_tests', @@ -205,6 +223,7 @@ uk_data.rename( uk_data[['date', 'hospital_cases', 'ventilator_beds', 'new_cases', 'new_deaths', + 'cum_deaths', 'hospital_cases', 'new_admissions', 'new_pcr_tests', 'new_tests', 'new_pillar_1_2_tests' ]].to_sql( @@ -220,6 +239,7 @@ uk_data[['date', "new_cases": Integer, "hospital_cases": Integer, "new_deaths": Integer, + "cum_deaths": Integer, "new_admissions": Integer, 'new_pcr_tests': Integer, 'new_tests': Integer, @@ -289,3 +309,5 @@ insert into uk_data_7(date, with engine.connect() as connection: connection.execute(query_string) + +# %%