{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from urllib.parse import urljoin\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the JSON file the scraped data is meant to be written to;\n",
    "# change this to redirect the output.\n",
    "OUTPUT_FILE: str = 'euromomo.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Change these if the script breaks (e.g. after the website is reorganised).\n",
    "# BASE_PAGE is the page whose HTML is scanned for <link as=\"script\"> tags.\n",
    "BASE_PAGE: str = \"https://www.euromomo.eu/graphs-and-maps/\"\n",
    "# Substring that the href of the wanted JS file must contain.\n",
    "JS_FILE_PATTERN: str = \"src-templates-graphs-and-maps-js\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First, we need to find the JS link inside the webpage.\n",
    "# The timeout keeps the notebook from hanging forever on an unresponsive server.\n",
    "r = requests.get(BASE_PAGE, timeout=30)\n",
    "if r.status_code != 200:\n",
    "    raise Exception(f\"Cannot reach webpage {BASE_PAGE} {r.status_code}\")\n",
    "soup = BeautifulSoup(r.text, features=\"lxml\")\n",
    "# Stays None when no <link as=\"script\"> tag matches JS_FILE_PATTERN.\n",
    "link_to_file = None\n",
    "for possible in soup.find_all(\"link\", attrs={\"as\": \"script\"}):\n",
    "    # .get() skips tags without an href instead of raising KeyError.\n",
    "    href = possible.get(\"href\") or \"\"\n",
    "    if JS_FILE_PATTERN in href:\n",
    "        # urljoin resolves root-relative, relative and absolute hrefs alike,\n",
    "        # against the URL the page was actually served from.\n",
    "        link_to_file = urljoin(r.url, href)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<link as=\"script\" href=\"/50fe40790468168eec944b4bf3a7387c41e08258-80a60393842707b4ef3f.js\" rel=\"preload\"/>,\n",
       " <link as=\"script\" href=\"/framework-e94c62ba7ecbd1156bac.js\" rel=\"preload\"/>,\n",
       " <link as=\"script\" href=\"/app-c8f47833a3ed3eeb35b5.js\" rel=\"preload\"/>,\n",
       " <link as=\"script\" href=\"/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js\" rel=\"preload\"/>,\n",
       " <link as=\"script\" href=\"/commons-5c085e7cd1a604626d58.js\" rel=\"preload\"/>,\n",
       " <link as=\"script\" href=\"/5e2a4920-b8478647cb0804da9c90.js\" rel=\"preload\"/>,\n",
       " <link as=\"script\" href=\"/styles-c41920853eed0685ae46.js\" rel=\"preload\"/>,\n",
       " <link as=\"script\" href=\"/webpack-runtime-ba300874af730e00e5e4.js\" rel=\"preload\"/>]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Debug view: every <link as=\"script\"> tag found in the fetched page.\n",
    "soup.find_all(\"link\", attrs={\"as\": \"script\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop here when no matching JS file was found.\n",
    "# Raise instead of calling exit(): inside a Jupyter kernel exit() requests a\n",
    "# kernel shutdown, whereas an exception simply halts the run and shows the reason.\n",
    "if link_to_file is None:\n",
    "    raise Exception(f\"Could not find a JS file with {JS_FILE_PATTERN} in its name :-(\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.euromomo.eu/component---src-templates-graphs-and-maps-js-4be2e0d0712cbdd396d8.js'"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the URL of the JS file that was selected.\n",
    "link_to_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We need a bit of renaming\n",
    "def rename_key(key, value):\n",
    "    \"\"\"Guess a descriptive name for one top-level entry of the scraped JSON.\n",
    "\n",
    "    The name is chosen only from the type and content of ``value``; ``key``\n",
    "    is used solely in the error message.\n",
    "\n",
    "    Args:\n",
    "        key: Original key of the entry.\n",
    "        value: Value stored under ``key`` (a list, a str/int number or a dict).\n",
    "\n",
    "    Returns:\n",
    "        One of \"countries\", \"age_groups\", \"week\", \"year\", \"data_totals\",\n",
    "        \"excess_mortality\", \"z_scores_country_age_groups\" or \"z_scores_country\".\n",
    "\n",
    "    Raises:\n",
    "        Exception: If none of the rules matches ``value``.\n",
    "    \"\"\"\n",
    "    if isinstance(value, list):\n",
    "        if \"Belgium\" in value:\n",
    "            return \"countries\"\n",
    "        if \"Total\" in value:\n",
    "            return \"age_groups\"\n",
    "    elif isinstance(value, (str, int)) and not isinstance(value, bool):\n",
    "        # Accept both numeric strings and plain integers. A non-numeric string\n",
    "        # must end in the generic error below, not in a stray ValueError.\n",
    "        try:\n",
    "            number = int(value)\n",
    "        except ValueError:\n",
    "            number = None\n",
    "        if number is not None:\n",
    "            # ISO 8601 years can have 53 weeks.\n",
    "            if number <= 53:\n",
    "                return \"week\"\n",
    "            if number >= 2020:\n",
    "                return \"year\"\n",
    "    elif isinstance(value, dict):\n",
    "        fields = set(value.keys())\n",
    "        if fields == {\"counts\", \"zscores\", \"weeks\"}:\n",
    "            return \"data_totals\"\n",
    "        if fields == {\"data\", \"years\"}:\n",
    "            return \"excess_mortality\"\n",
    "        if fields == {\"data\", \"weeks\"}:\n",
    "            # The two z-score tables are told apart by the length of \"data\".\n",
    "            if len(value[\"data\"]) == 24:\n",
    "                return \"z_scores_country_age_groups\"\n",
    "            if len(value[\"data\"]) >= 276:\n",
    "                return \"z_scores_country\"\n",
    "    raise Exception(f\"Cannot find valid key name. Key is {key} with value of type {type(value)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the JS file. Check the status so that an HTTP error page is never\n",
    "# mistaken for the script, and use a timeout so the cell cannot hang forever.\n",
    "js_response = requests.get(link_to_file, timeout=30)\n",
    "if js_response.status_code != 200:\n",
    "    raise Exception(f\"Cannot download JS file {link_to_file} {js_response.status_code}\")\n",
    "js_file: str = js_response.text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the argument of every JSON.parse('...') call found in the JS file.\n",
    "marker = \"JSON.parse('\"\n",
    "pos = 0\n",
    "found = []\n",
    "while True:\n",
    "    pos = js_file.find(marker, pos)\n",
    "    if pos == -1:\n",
    "        break\n",
    "    content_start = pos + len(marker)\n",
    "    end_pos = js_file.find(\"')\", content_start)\n",
    "    if end_pos == -1:\n",
    "        # Unterminated call: slicing with -1 would silently swallow almost the\n",
    "        # whole rest of the file, so stop scanning instead.\n",
    "        break\n",
    "\n",
    "    content = js_file[content_start:end_pos]\n",
    "    # A quote inside the payload means the simple \"')\" terminator search\n",
    "    # cannot be trusted for this file.\n",
    "    if \"'\" in content:\n",
    "        raise Exception(f\"Unexpected quote inside JSON.parse payload starting at offset {pos}\")\n",
    "    found.append(content)\n",
    "    # The payload holds no quote, so no further marker can start before end_pos.\n",
    "    pos = end_pos\n",
    "\n",
    "if not found:\n",
    "    raise Exception(\"Could not find any JSON.parse('...') call in the JS file\")\n",
    "\n",
    "# the biggest JSON part in the file is probably the good one ;-)\n",
    "biggest = max(found, key=len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decode the largest JSON.parse payload into Python objects.\n",
    "raw_data = json.loads(biggest)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "doing weeks\n"
     ]
    },
    {
     "ename": "Exception",
     "evalue": "Cannot find valid key name. Key is weeks with value of type <class 'list'>",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mException\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-38-7fbed730e1a3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-38-7fbed730e1a3>\u001b[0m in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrename_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mraw_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-34-f58db1eb5796>\u001b[0m in \u001b[0;36mrename_key\u001b[0;34m(key, value)\u001b[0m\n\u001b[1;32m     23\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"data\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0;36m276\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     24\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"z_scores_country\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m     \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Cannot find valid key name. Key is {key} with value of type {type(value)}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mException\u001b[0m: Cannot find valid key name. Key is weeks with value of type <class 'list'>"
     ]
    }
   ],
   "source": [
    "# Rename every top-level key. Fail loudly when two original keys map to the\n",
    "# same new name, because a dict comprehension would silently drop one of them.\n",
    "data = {}\n",
    "for k, v in raw_data.items():\n",
    "    new_key = rename_key(k, v)\n",
    "    if new_key in data:\n",
    "        raise Exception(f\"Key {k} would overwrite the existing entry {new_key}\")\n",
    "    data[new_key] = v"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the renamed data as JSON to the path configured in OUTPUT_FILE\n",
    "# (instead of a hard-coded copy of the file name).\n",
    "with open(OUTPUT_FILE, 'w') as f:\n",
    "    json.dump(data, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: number of JSON.parse('...') payloads collected from the JS file.\n",
    "len(found)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['weeks', 'ages', 'pooled', 'countries', 'reportYear', 'reportWeek']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Top-level keys of the decoded payload, in insertion order.\n",
    "list(raw_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): scratch cell unrelated to the scraping flow; looks like\n",
    "# leftover debugging and is a candidate for removal.\n",
    "type([1,2,3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}