+++ /dev/null
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "import requests\n",
- "from bs4 import BeautifulSoup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Name of the output JSON file. NOTE: only takes effect if the cell that writes the file uses this constant\n",
- "OUTPUT_FILE = 'euromomo.json'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Page to scrape and the substring that identifies its JS bundle; update these if the site changes and the script breaks\n",
- "BASE_PAGE = \"https://www.euromomo.eu/graphs-and-maps/\"\n",
- "JS_FILE_PATTERN = \"src-templates-graphs-and-maps-js\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "# First, we need to find the JS link inside the webpage.\n",
- "# The timeout keeps this cell from hanging forever on a stalled connection.\n",
- "r = requests.get(BASE_PAGE, timeout=30)\n",
- "if r.status_code != 200:\n",
- "    raise Exception(f\"Cannot reach webpage {BASE_PAGE} {r.status_code}\")\n",
- "soup = BeautifulSoup(r.text, features=\"lxml\")\n",
- "link_to_file = None\n",
- "for possible in soup.find_all(\"link\", attrs={\"as\": \"script\"}):\n",
- "    # .get() avoids a KeyError on a <link> tag without an href attribute\n",
- "    href = possible.get(\"href\", \"\")\n",
- "    if JS_FILE_PATTERN in href:\n",
- "        link_to_file = \"https://www.euromomo.eu\" + href\n",
- "        break"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[<link as=\"script\" href=\"/50fe40790468168eec944b4bf3a7387c41e08258-80a60393842707b4ef3f.js\" rel=\"preload\"/>,\n",
- " <link as=\"script\" href=\"/framework-e94c62ba7ecbd1156bac.js\" rel=\"preload\"/>,\n",
- " <link as=\"script\" href=\"/app-c8f47833a3ed3eeb35b5.js\" rel=\"preload\"/>,\n",
- " <link as=\"script\" href=\"/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js\" rel=\"preload\"/>,\n",
- " <link as=\"script\" href=\"/commons-5c085e7cd1a604626d58.js\" rel=\"preload\"/>,\n",
- " <link as=\"script\" href=\"/5e2a4920-b8478647cb0804da9c90.js\" rel=\"preload\"/>,\n",
- " <link as=\"script\" href=\"/styles-c41920853eed0685ae46.js\" rel=\"preload\"/>,\n",
- " <link as=\"script\" href=\"/webpack-runtime-ba300874af730e00e5e4.js\" rel=\"preload\"/>]"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "soup.find_all(\"link\", attrs={\"as\": \"script\"})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Raise instead of print + exit(1): this stops the run at this cell and shows the reason,\n",
- "# the same way the page-fetch failure is reported.\n",
- "if link_to_file is None:\n",
- "    raise Exception(f\"Could not find a JS file with {JS_FILE_PATTERN} in its name :-(\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'https://www.euromomo.eu/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js'"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "link_to_file"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [],
- "source": [
- "# We need a bit of renaming\n",
- "def rename_key(key, value):\n",
- "    \"\"\"Guess a descriptive name for one top-level entry of the scraped data.\n",
- "\n",
- "    The name is inferred from the shape of ``value`` only; ``key`` is used\n",
- "    solely in the error message.\n",
- "\n",
- "    Args:\n",
- "        key: Original key of the entry.\n",
- "        value: The entry's value (list, str or dict).\n",
- "\n",
- "    Returns:\n",
- "        One of \"countries\", \"age_groups\", \"week\", \"year\", \"data_totals\",\n",
- "        \"excess_mortality\", \"z_scores_country_age_groups\" or\n",
- "        \"z_scores_country\".\n",
- "\n",
- "    Raises:\n",
- "        Exception: If no rule matches ``value``.\n",
- "    \"\"\"\n",
- "    if isinstance(value, list):\n",
- "        if \"Belgium\" in value:\n",
- "            return \"countries\"\n",
- "        if \"Total\" in value:\n",
- "            return \"age_groups\"\n",
- "    elif isinstance(value, str):\n",
- "        try:\n",
- "            number = int(value)\n",
- "        except ValueError:\n",
- "            # Non-numeric string: fall through to the descriptive error below\n",
- "            number = None\n",
- "        if number is not None:\n",
- "            # ISO years can have 53 weeks, so 53 is still a week number\n",
- "            if number <= 53:\n",
- "                return \"week\"\n",
- "            if number >= 2020:\n",
- "                return \"year\"\n",
- "    elif isinstance(value, dict):\n",
- "        fields = set(value.keys())\n",
- "        if fields == {\"counts\", \"zscores\", \"weeks\"}:\n",
- "            return \"data_totals\"\n",
- "        if fields == {\"data\", \"years\"}:\n",
- "            return \"excess_mortality\"\n",
- "        if fields == {\"data\", \"weeks\"}:\n",
- "            if len(value[\"data\"]) == 24:\n",
- "                return \"z_scores_country_age_groups\"\n",
- "            if len(value[\"data\"]) >= 276:\n",
- "                return \"z_scores_country\"\n",
- "    raise Exception(f\"Cannot find valid key name. Key is {key} with value of type {type(value)}\")\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Fail early on an HTTP error instead of parsing an error page; the timeout bounds the wait\n",
- "js_response = requests.get(link_to_file, timeout=30)\n",
- "if js_response.status_code != 200:\n",
- "    raise Exception(f\"Cannot reach JS file {link_to_file} {js_response.status_code}\")\n",
- "js_file: str = js_response.text"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Collect the string argument of every JSON.parse('...') call in the JS file\n",
- "marker = \"JSON.parse('\"\n",
- "pos = 0\n",
- "found = []\n",
- "while True:\n",
- "    pos = js_file.find(marker, pos)\n",
- "    if pos == -1:\n",
- "        break\n",
- "    start = pos + len(marker)\n",
- "    end_pos = js_file.find(\"')\", start)\n",
- "    if end_pos == -1:\n",
- "        # Unterminated literal: slicing up to -1 would swallow the rest of the file\n",
- "        break\n",
- "\n",
- "    content = js_file[start:end_pos]\n",
- "    assert \"'\" not in content\n",
- "    found.append(content)\n",
- "    # Resume the search after this literal\n",
- "    pos = end_pos\n",
- "\n",
- "if not found:\n",
- "    raise Exception(f\"No JSON.parse('...') payload found in {link_to_file}\")\n",
- "# the biggest JSON part in the file is probably the good one ;-)\n",
- "biggest = max(found, key=len)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [],
- "source": [
- "raw_data = json.loads(biggest)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "doing weeks\n"
- ]
- },
- {
- "ename": "Exception",
- "evalue": "Cannot find valid key name. Key is weeks with value of type <class 'list'>",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m<ipython-input-38-7fbed730e1a3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m<ipython-input-38-7fbed730e1a3>\u001b[0m in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m<ipython-input-34-f58db1eb5796>\u001b[0m in \u001b[0;36mrename_key\u001b[0;34m(key, value)\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m276\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"z_scores_country\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Cannot find valid key name. Key is {key} with value of type {type(value)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;31mException\u001b[0m: Cannot find valid key name. Key is weeks with value of type <class 'list'>"
- ]
- }
- ],
- "source": [
- "# Rename every top-level entry; refuse to continue if two entries get the same name\n",
- "data = {}\n",
- "for original_key, value in raw_data.items():\n",
- "    new_key = rename_key(original_key, value)\n",
- "    if new_key in data:\n",
- "        # A plain dict comprehension would silently keep only the last entry\n",
- "        raise Exception(f\"Duplicate key name {new_key} (from {original_key})\")\n",
- "    data[new_key] = value"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Write to the configurable path instead of a hard-coded file name\n",
- "with open(OUTPUT_FILE, 'w') as f:\n",
- "    json.dump(data, f)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(found)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['weeks', 'ages', 'pooled', 'countries', 'reportYear', 'reportWeek']"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "[k for k in raw_data.keys()]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "list"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "type([1,2,3])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.4"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}