{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EuroMOMO data extraction\n",
    "\n",
    "This notebook looks up the JavaScript bundle referenced by the EuroMOMO graphs-and-maps page, extracts the largest `JSON.parse('...')` payload embedded in it, gives its top-level entries descriptive names and saves the result as a JSON file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change this\n",
    "OUTPUT_FILE = 'euromomo.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change this if the script breaks\n",
    "BASE_PAGE = \"https://www.euromomo.eu/graphs-and-maps/\"\n",
    "JS_FILE_PATTERN = \"src-templates-graphs-and-maps-js\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First, we need to find the JS link inside the webpage\n",
    "# (a timeout keeps the cell from hanging forever if the site does not answer)\n",
    "r = requests.get(BASE_PAGE, timeout=30)\n",
    "if r.status_code != 200:\n",
    "    raise Exception(f\"Cannot reach webpage {BASE_PAGE} {r.status_code}\")\n",
    "soup = BeautifulSoup(r.text, features=\"lxml\")\n",
    "link_to_file = None\n",
    "for possible in soup.find_all(\"link\", attrs={\"as\": \"script\"}):\n",
    "    # A <link> tag without an href must be skipped, not crash the lookup with a KeyError\n",
    "    href = possible.get(\"href\", \"\")\n",
    "    if JS_FILE_PATTERN in href:\n",
    "        link_to_file = \"https://www.euromomo.eu\" + href\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[,\n",
       " ,\n",
       " ,\n",
       " ,\n",
       " ,\n",
       " ,\n",
       " ,\n",
       " ]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.find_all(\"link\", attrs={\"as\": \"script\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if link_to_file is None:\n",
    "    # Raise instead of calling exit(): inside a notebook exit() asks the kernel to\n",
    "    # quit, whereas an exception stops the run at this cell with a visible error.\n",
    "    raise RuntimeError(f\"Could not find a JS file with {JS_FILE_PATTERN} in its name :-(\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.euromomo.eu/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js'"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "link_to_file"
   ]
  },
  {
"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# We need a bit of renaming\n", "def rename_key(key, value):\n", " print(f'doing {key}')\n", " if isinstance(value, list):\n", " if \"Belgium\" in value:\n", " return \"countries\"\n", " if \"Total\" in value:\n", " return \"age_groups\"\n", " elif isinstance(value, str):\n", " value = int(value)\n", " if value <= 52:\n", " return \"week\"\n", " if value >= 2020:\n", " return \"year\"\n", " elif isinstance(value, dict):\n", " if set(value.keys()) == {\"counts\", \"zscores\", \"weeks\"}:\n", " return \"data_totals\"\n", " if set(value.keys()) == {\"data\", \"years\"}:\n", " return \"excess_mortality\"\n", " if set(value.keys()) == {\"data\", \"weeks\"}:\n", " if len(value[\"data\"]) == 24:\n", " return \"z_scores_country_age_groups\"\n", " if len(value[\"data\"]) >= 276:\n", " return \"z_scores_country\"\n", " raise Exception(f\"Cannot find valid key name. Key is {key} with value of type {type(value)}\")\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "js_file: str = requests.get(link_to_file).text" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "pos = 0\n", "found = []\n", "while True:\n", " pos = js_file.find(\"JSON.parse('\", pos)\n", " if pos == -1:\n", " break\n", " end_pos = js_file.find(\"')\", pos)\n", "\n", " content = js_file[pos+len(\"JSON.parse('\"):end_pos]\n", " assert \"'\" not in content\n", " found.append(content)\n", " pos += 1\n", "\n", "# the biggest JSON part in the file is probably the good one ;-)\n", "biggest = max(found, key=lambda x: len(x))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "raw_data = json.loads(biggest)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "doing weeks\n" ] }, { "ename": "Exception", "evalue": 
"Cannot find valid key name. Key is weeks with value of type ", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m\u001b[0m in \u001b[0;36mrename_key\u001b[0;34m(key, value)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m 
\u001b[0;36m276\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"z_scores_country\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Cannot find valid key name. Key is {key} with value of type {type(value)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mException\u001b[0m: Cannot find valid key name. Key is weeks with value of type "
     ]
    }
   ],
   "source": [
    "data = {rename_key(k, v): v for k, v in raw_data.items()}\n",
    "# Two entries mapped to the same name would silently overwrite each other in the dict\n",
    "assert len(data) == len(raw_data), \"rename_key produced duplicate names\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write to the path configured at the top of the notebook instead of a hard-coded name\n",
    "with open(OUTPUT_FILE, 'w') as f:\n",
    "    json.dump(data, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(found)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['weeks', 'ages', 'pooled', 'countries', 'reportYear', 'reportWeek']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[k for k in raw_data.keys()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }