Create a treemap of letter frequencies
authorNeil Smith <neil.git@njae.me.uk>
Fri, 16 Mar 2018 20:29:39 +0000 (20:29 +0000)
committerNeil Smith <neil.git@njae.me.uk>
Fri, 16 Mar 2018 20:29:39 +0000 (20:29 +0000)
letter-treemap.png [new file with mode: 0644]
make-letter-frequency-treemap.ipynb [new file with mode: 0644]

diff --git a/letter-treemap.png b/letter-treemap.png
new file mode 100644 (file)
index 0000000..6cde8ed
Binary files /dev/null and b/letter-treemap.png differ
diff --git a/make-letter-frequency-treemap.ipynb b/make-letter-frequency-treemap.ipynb
new file mode 100644 (file)
index 0000000..224c3b7
--- /dev/null
@@ -0,0 +1,237 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# libraries\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "\n",
+    "import squarify    # pip install squarify (algorithm for treemap)\n",
+    "\n",
+    "from support.language_models import *\n",
+    "from support.utilities import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "defaultdict(int,\n",
+       "            {'a': 0.07822525209432887,\n",
+       "             'b': 0.014829998223636929,\n",
+       "             'c': 0.02251879345845122,\n",
+       "             'd': 0.042759915992231244,\n",
+       "             'e': 0.12099426536374505,\n",
+       "             'f': 0.02159693603704411,\n",
+       "             'g': 0.018815084434702378,\n",
+       "             'h': 0.06645305621431015,\n",
+       "             'i': 0.06723047441023709,\n",
+       "             'j': 0.0010659774441790274,\n",
+       "             'k': 0.00865805425839555,\n",
+       "             'l': 0.04134042154867259,\n",
+       "             'm': 0.027483193578407596,\n",
+       "             'n': 0.06693265828344594,\n",
+       "             'o': 0.08052207518149467,\n",
+       "             'p': 0.016070260346516884,\n",
+       "             'q': 0.0008776478463153873,\n",
+       "             'r': 0.059626906298523796,\n",
+       "             's': 0.06455443850567806,\n",
+       "             't': 0.08946868868814231,\n",
+       "             'u': 0.03036719004738724,\n",
+       "             'v': 0.010421489620086533,\n",
+       "             'w': 0.024603665947343364,\n",
+       "             'x': 0.0011832844394584982,\n",
+       "             'y': 0.022829377693572104,\n",
+       "             'z': 0.0005708940436934243})"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "normalised_english_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(['e',\n",
+       "  't',\n",
+       "  'o',\n",
+       "  'a',\n",
+       "  'i',\n",
+       "  'n',\n",
+       "  'h',\n",
+       "  's',\n",
+       "  'r',\n",
+       "  'd',\n",
+       "  'l',\n",
+       "  'u',\n",
+       "  'm',\n",
+       "  'w',\n",
+       "  'y',\n",
+       "  'c',\n",
+       "  'f',\n",
+       "  'g',\n",
+       "  'p',\n",
+       "  'b',\n",
+       "  'v',\n",
+       "  'k',\n",
+       "  'x',\n",
+       "  'j',\n",
+       "  'q',\n",
+       "  'z'],\n",
+       " [0.12099426536374505,\n",
+       "  0.08946868868814231,\n",
+       "  0.08052207518149467,\n",
+       "  0.07822525209432887,\n",
+       "  0.06723047441023709,\n",
+       "  0.06693265828344594,\n",
+       "  0.06645305621431015,\n",
+       "  0.06455443850567806,\n",
+       "  0.059626906298523796,\n",
+       "  0.042759915992231244,\n",
+       "  0.04134042154867259,\n",
+       "  0.03036719004738724,\n",
+       "  0.027483193578407596,\n",
+       "  0.024603665947343364,\n",
+       "  0.022829377693572104,\n",
+       "  0.02251879345845122,\n",
+       "  0.02159693603704411,\n",
+       "  0.018815084434702378,\n",
+       "  0.016070260346516884,\n",
+       "  0.014829998223636929,\n",
+       "  0.010421489620086533,\n",
+       "  0.00865805425839555,\n",
+       "  0.0011832844394584982,\n",
+       "  0.0010659774441790274,\n",
+       "  0.0008776478463153873,\n",
+       "  0.0005708940436934243])"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ls = sorted(normalised_english_counts, key=normalised_english_counts.get, reverse=True)\n",
+    "cs = [normalised_english_counts[l] for l in ls]\n",
+    "ls, cs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAARAAAAD8CAYAAAC/+/tYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEmtJREFUeJzt3Xl0XNV9wPH7ZkazaGa0zMhaPFoteRV4N8aybDkhtDZgk5AEmrQhiYGEkMY5CeEUSGh7yglJ6xA4lCb0QJw2UEhJgQDxUnJivMgYvOANbEu2ZGuxdmk00kgzmu31j9SBQHDgjjRXo/f9/OXRkd79/WF9de9o9EbTdV0AgAyT6gEApC8CAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0i+oBhBDizdbSl1XPoNoNO7+ueoSU27zqWWVr1w/NUrb2pZTb+8f1eq+sXyCEEOLswGtlHcMny4QQIqZHMmxm1+jKkpv3bz+7eX0y158UAQEwsao8NS1VnpqWhB7TXmt/akVZ9qLm8bguRxjAQI51b7ss1+7rK866vHs8rkdAAINo9h8oHosFHfPyrmocr2tyhAGmsNoXTwkhhHjm24cvO/u77lJvmTNwtuAXi4vmZA2uvXNu0scYdiDAFNf0el/2mb29PpvTEo2NxS3nDvQXnt7VPX08rs0OBJDU3xRwvPTNPVd8+Tfrd6ue5VKaXu/zLLjOd/76f7i8QQghfnX3kXnuafbweFybHQiQRjrPBB33X723TvUcFxEQIAl6XNd+/fVdi7Zc89Ka57+yc8lYMGpWPdN7Va7IGzi7v7dwbCRmGh2MmE/s6Cwfr2sTECAJwe5R14KbZp7fuG3DroxMS+zgE2+XpWrtjobhzAfW1q9u2Nef/UGf8/iB1WKndkMgNmfRhQfW7q/bfNOJ5dbZVZ3HO4vF4wdWJz0Dz4EASXDk2kIVq31+IYSYc215+7FfnqkQQozLi7QupfWtgPOp75xYctP91Ucrl+UO/bnP93z++rOez19/VgghWm69+1prSVFgPOYgIEAytD/zeAKEh2K2/9h0bNnND80/VL4gJzjxK34wjjBAEkIDY47z9R25QgjRsK3FV3i5d2Ci17RmmqPuPFuo8bUBj8zXa2ZzIvfGa3kpO6CaqyAzePSXjeVbrnlpzVgwal12S3XLRK9psmiJ23+2+NCxHd3Fu3/R4pvo9S6FIwwgyVuZHdq4fcMuFWs73Bnx27csPvDozYdW2J2W2PJP+8blb1s+KgKCtPPSE93Fr/xXb6UmhF5UYR++d0vVEdUzpUrRTFfovt+u2i2EEG6vLXbP1pV7L/X5ngMZ7/tYa0J75+NfTm4eAoK00vBm0LXt5z0zv//c7H3eQmvE3xN9/3cI/shQW0N2255fLZxz4117o8FBq56ImYJd59yuworhZK9NQJBWjuwayltYl9XpLbRGhBAiNz8jqnqmyS6rZHbAXTyrq2Xn09XDbQ0+Z35Zz3jEQwieRAUMoXjlpxpD/R25Ga6c4Mzrv35wvK7LDgRpZdGarL6HvnFu2UB3pMlTYI36e6IZMruQl3ctm4jxkvaNtTsm5LrR0YA1EYtaNFMikYhGzGabIz4e1yUgSCuzF7uC13wp/8x9NzbWmExC91XZh+5+vOqo6rkmu5adz8wvWPixhsjQgKN1z7NzK67+4lvjcV0CAiGEELquC6HrQjNN/lPthtsK2jfcVtCueo500X301WLNZNLz59dd0BNxcerZzbX+pqPe3MqFSd/BmYAYWORCl6PnwZ9fmVFc6I92dOfkf+vLb1h9BSHVcxnZC3dfPa7Xc4mYcOWvahf5q9pFR0wIIcQVq79TL4QQFx8ng4AYXNwfcHq/dMMRx/zZg6pnQfqZ/PtVTChTlitEPCCLgBicZs1Ifh8LwyIgAKTxHAggKdLZ5eh6/PHl1oKCQKS7OzsjzztcsHHjUZPdPi6vsUgHht+BPPKDwRnXreyou25lR92j/zxYoXqeVLL6CkPFm/9uUt9RfLKLDw66smpXni/9+/t2aVZbbPCV36bsloaTgaEDcqA+nP3braMlz2wvrH96W2H9jhdHyw7tD2epngvpw+RyhjKrq/1CCOFeurQ93NoqdZMfWY1Hnp3TcvqV8ouPzx5/flbz21tnpGp9YwfktbDnylX2LneWKZ6VbYqvqLN3vlEf9qqeC+lEu8SjiZdfsqSjv+vtP7xJ1ED36emFpcs6UrW+oQMCJCsRDDpGT57KFUKI4OHDPltZ2YTf0vDdcvIqh2LRkDU00mcL9DdnmS22aKY7f1zeNOrDMHRAltfaB97YGy4MDifMw0MJ8/494cLltfakX94L4zDn5ASH6veWt/7T/WsS4bA15+pPTPgtDd8rN392Z3froendbYenewurU7b7EMLgv4VZVmMPfOLazLa/WttVK4QQazdkti5dYf+zt8gHLtI0TS/8yleU3hGtoGTphTPH/mdBLBqyzl/5tddSubahAyKEEJvuyWnedE/OhL+PBzBRsjxlwXg8YsmwusIOp3cslWsbPiBQ5yd33Khu8bXJX8JaVBgque97k+LX4Fd84h4lcxj6ORAAySEgAKQREADSCAgAaTyJCkP65rptqkf4I4++cK0QQojOK9PrW5IdCABpBASANAICQBoBASCNgACQRkAASCMgAKSl1y+dpzCrK6J6BAWsqgdAktiBAJBGQJBWGk4+P7v5zI4/3D2/4e3n5rz7MVKLgCCtFJfWtPZ2nygWQghdT4j+3tPTfSU17arm2fpwU8U/frx+zWO3HlmkagaVeA4EacXpKgxZLPaof6ApKxIesmU6pwVs9qyoqnkOvthZdseWRa/nVzhTdiPjyYSAIO0UTF/S2tl+oCQSCdoKfUvaVM2xZdPxy4d6I87Hbju6fNG6gtb1d1adS+X6vS89P3Pk5IlikyNzzOx2h21F0wfz1m1I6e05OcIg7RT5lnUG/OfzR4M9OfmFC3tUzbHxkfknnDkZ4U1PLdmf6niMnmnIHm08VVT67bt3+267441IT3d2Kte/iB0I0o7ZnKG7s0v6LBZb1GQyqx5HiVDzWY+jalaXyWpLCKtNOGZUdauYgx0I0o6uJ0RwuCPXV1qj7PiC3yMgSCuBwVbX/t0/+Hh2TlmfO8s3onoeVRwzqgZCTWcKEpExUzw0ag41ny1QMQdHGKSV7JzSYM2a7+5UPYdqmTNnBxxVs7pbfvzDOnNm5pjVmzdssjtiqZ6DgABJuL9+1e9Ure39i2ua8q//TGMiHDa3/eShGntpeSDVMxAQIE11PfPk/NhAn1uPx03O6vntmZUzjRmQLz78LdUjqFcTVD0B0oxv41eVvievEDyJCiAJBASANAICQBoBASCNgACQRkAASCMgAKQREADSJsULyWBMtsNNytbeXjvjfR/TS4oUTPJ7M8SgGPtR+v1tIDsQANLYgRhUfCRs7njgySXxQNCuJ3Qtd33NmZx1V3aongvphYAY1PC+E9PM2c5wyfdvOyCEELGhEf4v4CPjCGNQ9krfcLixbVrXv70wN3jwlMeS5Uz5vSSQ/giIQdkrp4+Ubr5jj620YKj/v1+d0/PEb2aqngnph22rQUW6+m2WHHc0d33NBbPLEQ28+map6pmQfgiIQYXPtGf1/3LnPE3TdGE26fm3XHtc9UxIPwTEoLJWLejNWrVgt+o5kN54DgSANAICQBoBASCNgACQRkAASCMgAKQREADSCAgAaQQEgDReiYq0NBwbcLw5/L9X1OV+brcQQjSMvDEjrkct81y1jamaIRjqcbzZ/PRyl6NgcDjUleuy5w/6PAvbmrv3zI7Gw9bqkg1H8rIqBydi7aZ/311pyjAnKjbWnjv1wNbq0daBrCWPfWF/7+4Gb+f2t0rn//DTKXnbS3YgQBLC0WFnRf7KptXzvvlqKOJ3dfpP+FbMvn1fZWHdyebuvVUTtW7OotL+oZMdHiGEGDnXlx0fi5kTkZg2eKzNm109vX+i1n0vwwckNjps6d6/o0z1HEhPtgzXaK6rdFjTTCLT6hn2uMr7NE0T2Zm+4bHocOZErZu7sCQw2jqQHQ2ELJrFnHBV5fv9R1pzhk51eXKXlA1M1LrvRUBGgxmDJw+Wq54DH41JM+lCCO3i44SIm1XMoWmmxLseCJPJkhBCCE1oui4S2gd+YZJMVotu9TpH2587XOKeVeDPudw34D903jvWN+x0zykKTtS675sjVQtNVp27X5wbCw45G7d8f3X7jqfnqp4HH47D5B6L6mPWcGIkI67HTP3RCwWqZ0o195yigc6tx2fkLCzp91xR0d+z83RZZrEnoJkmrFvvY/gnUYvqrj/V8uIT7lkbv7tH9Sz48EyaWS+zV595ffDXqzJM9rDD5E7ZT93JImdByUDn1uMzPcsq/BaXLa5lmBNZ84pSdnwRQghN1/VUrvcnXf7th15WtXa4r9PR8uITV8y+5T6l98YYqzHc/38x4/Y21SP8EZXvCyOEUPK+MLuu+tH6ZL7e8EcYAPIMHxCzPTOWiEYNf5QDZBg+IBmu7Kgj3zfQ8LP763gSFfho+MkrhCi/4aspedUeMNUYfgcCQB47kEkiHlPyOiggKexAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApPFSdiijPzdh9xyWFPjIXxH+kW/8ln/YNX7X+rCuSu7L2YEAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAGFWnrdrRu2lyneg6kNwICQBp/TGdkCV3revCp+ZHznR5zljNU9L1bDpoctoTqsZA+2IEYWGxgyJm9ruZ86b/etUtz2GJDOw8WqZ4J6YWAGJg5xz3qmDdjSAghbGVFg7Fe/2T7+3pMcpPiCGML6KpHUG5UwZqaxfTOccWk6XokoSkYA2mMHQimvJMP7ZzV+NO9M1TPIaN+273rVM9wKQQEgLRJcYRB6llLCkKlj9y1++Jj71+va1Y5z3g7/ciuqu49TSUZbtuYzesMuyvzBlXPNBWxA8GU03+4Nbv3tXO+lf/5N3uWPvzpA8Fz/TmqZ5qq2IFgyhk43ObxLivtsjhtcSGE8Cwu7lI901TFDgSANAKCKceztHSg/1BrYWw0YooOhc0DR9oLVM80VXGEwZTjXVwSmLai4sK+m5+sy3DbxlzlXp5AnSAEBFPSnE1rzs7ZtOas6jmSVXvNA9tVz3ApHGEASCMgAKQREADSCAgAaQQEgDQCAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJDGDYUmiZzsEdUjpITzsXdukB4SPoWTJMdf9f/fOrPUzqEaOxAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApBEQANIICABpBASANAICQBoBASCNgACQRkAASCMgAKQZ/oZC3W/t9vU1vF6hJ+Imh2f64IyPfeG4ZjKrHgtIC4begYz0tLj8549Pn/vJO/dd9tl79whN07vf3lusei4gXRh6BzLYdjIvHOjJOfn8v6wSQohEPGa22F1jqucC0oWhAyKEEDmll7WVr7rptOo5gHRk6CNMdsm8vqELp4vGgn6rEEJER4cyQv4uh+q5gHRh6B2IK78sWDj/qoYz2396pS50TdNMiZLlnzzhyC0MqZ4NSAeGDogQQuTPq+3In1fboXoOIB0Z+ggDIDkEBIA0AgJAGgEBII2AYMo5/OqDK1XPYBQEBFPOko/duU/1DEZh+F/jGlXjXU8ujfpHHHosbvb+5YLmos/Vtqqeabzs+82961Ze98B21XPICvd1OlpeeHy5Pb94INx7wWNxuELln/3aQbPVnlA923uxAzGoins+daz6idv3zv3JrXv6XzlWEekfzlA9E94RDQaceYtXn5996327TFZ7zH/i9SLVM/0p7EAMqvPp+oqhN5sLhRAiNjjqCDX3OK1e96DqufB7FmfWqLOkakgIIezTpg9GAgOZqmf6UwiIAfn3nfYGT7blzX30ln3mTFv81N/+bEUiEuUmKJOIZjIl3vm3puuxhKZyng/CEcaA4sGwxZxpi5ozbfGRhg5XuLU3V/VMSE/sQAzIU1fd27fjaPmJLz66xpafHbSXTvOrngnpSdN1XfUMYuktP35Z9Qyq6Z/pVz1CSjgfy1E9wrjwV02Nn70nHvzW+mS+niMMAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApBEQANIICABpBASANAICQBoBASCNgACQRkAASCMgAKQREADSCAgAaQQEgDQCAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBIG1SvLk2gPTEDgSANAICQBoBASCNgACQRkAASCMgAKQREADSCAgAaQQEgDQCAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApBEQANL+D7vRSmjps6oAAAAAAElFTkSuQmCC\n",
+      "text/plain": [
+       "<matplotlib.figure.Figure at 0x7f1224d9be10>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# If you have 2 lists\n",
+    "plt.rcParams[\"figure.figsize\"] = (4,4)\n",
+    "squarify.plot(sizes=cs, label=ls, alpha=.7 )\n",
+    "plt.axis('off')\n",
+    "plt.savefig('letter-treemap.png', bbox_inches='tight')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'treattlpis'"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cat(random_english_letter() for _ in range(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'lbycjleuqz'"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import random\n",
+    "import string\n",
+    "\n",
+    "cat(random.choices(string.ascii_lowercase, k=10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}