From: Neil Smith Date: Fri, 16 Mar 2018 20:29:39 +0000 (+0000) Subject: Create a treemap of letter frequencies X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=759fda3aeb14a98bd5c096175c2b45e5cd75ac39;p=cipher-tools.git Create a treemap of letter frequencies --- diff --git a/letter-treemap.png b/letter-treemap.png new file mode 100644 index 0000000..6cde8ed Binary files /dev/null and b/letter-treemap.png differ diff --git a/make-letter-frequency-treemap.ipynb b/make-letter-frequency-treemap.ipynb new file mode 100644 index 0000000..224c3b7 --- /dev/null +++ b/make-letter-frequency-treemap.ipynb @@ -0,0 +1,237 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# libraries\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "import squarify # pip install squarify (algorithm for treemap)\n", + "\n", + "from support.language_models import *\n", + "from support.utilities import *" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "defaultdict(int,\n", + " {'a': 0.07822525209432887,\n", + " 'b': 0.014829998223636929,\n", + " 'c': 0.02251879345845122,\n", + " 'd': 0.042759915992231244,\n", + " 'e': 0.12099426536374505,\n", + " 'f': 0.02159693603704411,\n", + " 'g': 0.018815084434702378,\n", + " 'h': 0.06645305621431015,\n", + " 'i': 0.06723047441023709,\n", + " 'j': 0.0010659774441790274,\n", + " 'k': 0.00865805425839555,\n", + " 'l': 0.04134042154867259,\n", + " 'm': 0.027483193578407596,\n", + " 'n': 0.06693265828344594,\n", + " 'o': 0.08052207518149467,\n", + " 'p': 0.016070260346516884,\n", + " 'q': 0.0008776478463153873,\n", + " 'r': 0.059626906298523796,\n", + " 's': 0.06455443850567806,\n", + " 't': 0.08946868868814231,\n", + " 'u': 0.03036719004738724,\n", + " 'v': 0.010421489620086533,\n", + " 'w': 0.024603665947343364,\n", + " 'x': 0.0011832844394584982,\n", + " 'y': 0.022829377693572104,\n", + " 'z': 0.0005708940436934243})" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "normalised_english_counts" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(['e',\n", + " 't',\n", + " 'o',\n", + " 'a',\n", + " 'i',\n", + " 'n',\n", + " 'h',\n", + " 's',\n", + " 'r',\n", + " 'd',\n", + " 'l',\n", + " 'u',\n", + " 'm',\n", + " 'w',\n", + " 'y',\n", + " 'c',\n", + " 'f',\n", + " 'g',\n", + " 'p',\n", + " 'b',\n", + " 'v',\n", + " 'k',\n", + " 'x',\n", + " 'j',\n", + " 'q',\n", + " 'z'],\n", + " [0.12099426536374505,\n", + " 0.08946868868814231,\n", + " 0.08052207518149467,\n", + " 0.07822525209432887,\n", + " 0.06723047441023709,\n", + " 0.06693265828344594,\n", + " 0.06645305621431015,\n", + " 0.06455443850567806,\n", + " 0.059626906298523796,\n", + " 0.042759915992231244,\n", + " 0.04134042154867259,\n", + " 0.03036719004738724,\n", + " 0.027483193578407596,\n", + " 0.024603665947343364,\n", + " 0.022829377693572104,\n", + " 0.02251879345845122,\n", + " 0.02159693603704411,\n", + " 0.018815084434702378,\n", + " 0.016070260346516884,\n", + " 0.014829998223636929,\n", + " 0.010421489620086533,\n", + " 0.00865805425839555,\n", + " 0.0011832844394584982,\n", + " 0.0010659774441790274,\n", + " 0.0008776478463153873,\n", + " 0.0005708940436934243])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ls = sorted(normalised_english_counts, key=normalised_english_counts.get, reverse=True)\n", + "cs = [normalised_english_counts[l] for l in ls]\n", + "ls, cs" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAARAAAAD8CAYAAAC/+/tYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEmtJREFUeJzt3Xl0XNV9wPH7ZkazaGa0zMhaPFoteRV4N8aybDkhtDZgk5AEmrQhiYGEkMY5CeEUSGh7yglJ6xA4lCb0QJw2UEhJgQDxUnJivMgYvOANbEu2ZGuxdmk00kgzmu31j9SBQHDgjjRXo/f9/OXRkd79/WF9de9o9EbTdV0AgAyT6gEApC8CAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0i+oBhBDizdbSl1XPoNoNO7+ueoSU27zqWWVr1w/NUrb2pZTb+8f1eq+sXyCEEOLswGtlHcMny4QQIqZHMmxm1+jKkpv3bz+7eX0y158UAQEwsao8NS1VnpqWhB7TXmt/akVZ9qLm8bguRxjAQI51b7ss1+7rK866vHs8rkdAAINo9h8oHosFHfPyrmocr2tyhAGmsNoXTwkhhHjm24cvO/u77lJvmTNwtuAXi4vmZA2uvXNu0scYdiDAFNf0el/2mb29PpvTEo2NxS3nDvQXnt7VPX08rs0OBJDU3xRwvPTNPVd8+Tfrd6ue5VKaXu/zLLjOd/76f7i8QQghfnX3kXnuafbweFybHQiQRjrPBB33X723TvUcFxEQIAl6XNd+/fVdi7Zc89Ka57+yc8lYMGpWPdN7Va7IGzi7v7dwbCRmGh2MmE/s6Cwfr2sTECAJwe5R14KbZp7fuG3DroxMS+zgE2+XpWrtjobhzAfW1q9u2Nef/UGf8/iB1WKndkMgNmfRhQfW7q/bfNOJ5dbZVZ3HO4vF4wdWJz0Dz4EASXDk2kIVq31+IYSYc215+7FfnqkQQozLi7QupfWtgPOp75xYctP91Ucrl+UO/bnP93z++rOez19/VgghWm69+1prSVFgPOYgIEAytD/zeAKEh2K2/9h0bNnND80/VL4gJzjxK34wjjBAEkIDY47z9R25QgjRsK3FV3i5d2Ci17RmmqPuPFuo8bUBj8zXa2ZzIvfGa3kpO6CaqyAzePSXjeVbrnlpzVgwal12S3XLRK9psmiJ23+2+NCxHd3Fu3/R4pvo9S6FIwwgyVuZHdq4fcMuFWs73Bnx27csPvDozYdW2J2W2PJP+8blb1s+KgKCtPPSE93Fr/xXb6UmhF5UYR++d0vVEdUzpUrRTFfovt+u2i2EEG6vLXbP1pV7L/X5ngMZ7/tYa0J75+NfTm4eAoK00vBm0LXt5z0zv//c7H3eQmvE3xN9/3cI/shQW0N2255fLZxz4117o8FBq56ImYJd59yuworhZK9NQJBWjuwayltYl9XpLbRGhBAiNz8jqnqmyS6rZHbAXTyrq2Xn09XDbQ0+Z35Zz3jEQwieRAUMoXjlpxpD/R25Ga6c4Mzrv35wvK7LDgRpZdGarL6HvnFu2UB3pMlTYI36e6IZMruQl3ctm4jxkvaNtTsm5LrR0YA1EYtaNFMikYhGzGabIz4e1yUgSCuzF7uC13wp/8x9NzbWmExC91XZh+5+vOqo6rkmu5adz8wvWPixhsjQgKN1z7NzK67+4lvjcV0CAiGEELquC6HrQjNN/lPthtsK2jfcVtCueo500X301WLNZNLz59dd0BNxcerZzbX+pqPe3MqFSd/BmYAYWORCl6PnwZ9fmVFc6I92dOfkf+vLb1h9BSHVcxnZC3dfPa7Xc4mYcOWvahf5q9pFR0wIIcQVq79TL4QQFx8ng4AYXNwfcHq/dMMRx/zZg6pnQfqZ/PtVTChTlitEPCCLgBicZs1Ifh8LwyIgAKTxHAggKdLZ5eh6/PHl1oKCQKS7OzsjzztcsHHjUZPdPi6vsUgHht+BPPKDwRnXreyou25lR92j/zxYoXqeVLL6CkPFm/9uUt9RfLKLDw66smpXni/9+/t2aVZbbPCV36bsloaTgaEDcqA+nP3braMlz2wvrH96W2H9jhdHyw7tD2epngvpw+RyhjKrq/1CCOFeurQ93NoqdZMfWY1Hnp3TcvqV8ouPzx5/flbz21tnpGp9YwfktbDnylX2LneWKZ6VbYqvqLN3vlEf9qqeC+lEu8SjiZdfsqSjv+vtP7xJ1ED36emFpcs6UrW+oQMCJCsRDDpGT57KFUKI4OHDPltZ2YTf0vDdcvIqh2LRkDU00mcL9DdnmS22aKY7f1zeNOrDMHRAltfaB97YGy4MDifMw0MJ8/494cLltfakX94L4zDn5ASH6veWt/7T/WsS4bA15+pPTPgtDd8rN392Z3froendbYenewurU7b7EMLgv4VZVmMPfOLazLa/WttVK4QQazdkti5dYf+zt8gHLtI0TS/8yleU3hGtoGTphTPH/mdBLBqyzl/5tddSubahAyKEEJvuyWnedE/OhL+PBzBRsjxlwXg8YsmwusIOp3cslWsbPiBQ5yd33Khu8bXJX8JaVBgque97k+LX4Fd84h4lcxj6ORAAySEgAKQREADSCAgAaTyJCkP65rptqkf4I4++cK0QQojOK9PrW5IdCABpBASANAICQBoBASCNgACQRkAASCMgAKSl1y+dpzCrK6J6BAWsqgdAktiBAJBGQJBWGk4+P7v5zI4/3D2/4e3n5rz7MVKLgCCtFJfWtPZ2nygWQghdT4j+3tPTfSU17arm2fpwU8U/frx+zWO3HlmkagaVeA4EacXpKgxZLPaof6ApKxIesmU6pwVs9qyoqnkOvthZdseWRa/nVzhTdiPjyYSAIO0UTF/S2tl+oCQSCdoKfUvaVM2xZdPxy4d6I87Hbju6fNG6gtb1d1adS+X6vS89P3Pk5IlikyNzzOx2h21F0wfz1m1I6e05OcIg7RT5lnUG/OfzR4M9OfmFC3tUzbHxkfknnDkZ4U1PLdmf6niMnmnIHm08VVT67bt3+267441IT3d2Kte/iB0I0o7ZnKG7s0v6LBZb1GQyqx5HiVDzWY+jalaXyWpLCKtNOGZUdauYgx0I0o6uJ0RwuCPXV1qj7PiC3yMgSCuBwVbX/t0/+Hh2TlmfO8s3onoeVRwzqgZCTWcKEpExUzw0ag41ny1QMQdHGKSV7JzSYM2a7+5UPYdqmTNnBxxVs7pbfvzDOnNm5pjVmzdssjtiqZ6DgABJuL9+1e9Ure39i2ua8q//TGMiHDa3/eShGntpeSDVMxAQIE11PfPk/NhAn1uPx03O6vntmZUzjRmQLz78LdUjqFcTVD0B0oxv41eVvievEDyJCiAJBASANAICQBoBASCNgACQRkAASCMgAKQREADSJsULyWBMtsNNytbeXjvjfR/TS4oUTPJ7M8SgGPtR+v1tIDsQANLYgRhUfCRs7njgySXxQNCuJ3Qtd33NmZx1V3aongvphYAY1PC+E9PM2c5wyfdvOyCEELGhEf4v4CPjCGNQ9krfcLixbVrXv70wN3jwlMeS5Uz5vSSQ/giIQdkrp4+Ubr5jj620YKj/v1+d0/PEb2aqngnph22rQUW6+m2WHHc0d33NBbPLEQ28+map6pmQfgiIQYXPtGf1/3LnPE3TdGE26fm3XHtc9UxIPwTEoLJWLejNWrVgt+o5kN54DgSANAICQBoBASCNgACQRkAASCMgAKQREADSCAgAaQQEgDReiYq0NBwbcLw5/L9X1OV+brcQQjSMvDEjrkct81y1jamaIRjqcbzZ/PRyl6NgcDjUleuy5w/6PAvbmrv3zI7Gw9bqkg1H8rIqBydi7aZ/311pyjAnKjbWnjv1wNbq0daBrCWPfWF/7+4Gb+f2t0rn//DTKXnbS3YgQBLC0WFnRf7KptXzvvlqKOJ3dfpP+FbMvn1fZWHdyebuvVUTtW7OotL+oZMdHiGEGDnXlx0fi5kTkZg2eKzNm109vX+i1n0vwwckNjps6d6/o0z1HEhPtgzXaK6rdFjTTCLT6hn2uMr7NE0T2Zm+4bHocOZErZu7sCQw2jqQHQ2ELJrFnHBV5fv9R1pzhk51eXKXlA1M1LrvRUBGgxmDJw+Wq54DH41JM+lCCO3i44SIm1XMoWmmxLseCJPJkhBCCE1oui4S2gd+YZJMVotu9TpH2587XOKeVeDPudw34D903jvWN+x0zykKTtS675sjVQtNVp27X5wbCw45G7d8f3X7jqfnqp4HH47D5B6L6mPWcGIkI67HTP3RCwWqZ0o195yigc6tx2fkLCzp91xR0d+z83RZZrEnoJkmrFvvY/gnUYvqrj/V8uIT7lkbv7tH9Sz48EyaWS+zV595ffDXqzJM9rDD5E7ZT93JImdByUDn1uMzPcsq/BaXLa5lmBNZ84pSdnwRQghN1/VUrvcnXf7th15WtXa4r9PR8uITV8y+5T6l98YYqzHc/38x4/Y21SP8EZXvCyOEUPK+MLuu+tH6ZL7e8EcYAPIMHxCzPTOWiEYNf5QDZBg+IBmu7Kgj3zfQ8LP763gSFfho+MkrhCi/4aspedUeMNUYfgcCQB47kEkiHlPyOiggKexAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApPFSdiijPzdh9xyWFPjIXxH+kW/8ln/YNX7X+rCuSu7L2YEAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAGFWnrdrRu2lyneg6kNwICQBp/TGdkCV3revCp+ZHznR5zljNU9L1bDpoctoTqsZA+2IEYWGxgyJm9ruZ86b/etUtz2GJDOw8WqZ4J6YWAGJg5xz3qmDdjSAghbGVFg7Fe/2T7+3pMcpPiCGML6KpHUG5UwZqaxfTOccWk6XokoSkYA2mMHQimvJMP7ZzV+NO9M1TPIaN+273rVM9wKQQEgLRJcYRB6llLCkKlj9y1++Jj71+va1Y5z3g7/ciuqu49TSUZbtuYzesMuyvzBlXPNBWxA8GU03+4Nbv3tXO+lf/5N3uWPvzpA8Fz/TmqZ5qq2IFgyhk43ObxLivtsjhtcSGE8Cwu7lI901TFDgSANAKCKceztHSg/1BrYWw0YooOhc0DR9oLVM80VXGEwZTjXVwSmLai4sK+m5+sy3DbxlzlXp5AnSAEBFPSnE1rzs7ZtOas6jmSVXvNA9tVz3ApHGEASCMgAKQREADSCAgAaQQEgDQCAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJDGDYUmiZzsEdUjpITzsXdukB4SPoWTJMdf9f/fOrPUzqEaOxAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApBEQANIICABpBASANAICQBoBASCNgACQRkAASCMgAKQZ/oZC3W/t9vU1vF6hJ+Imh2f64IyPfeG4ZjKrHgtIC4begYz0tLj8549Pn/vJO/dd9tl79whN07vf3lusei4gXRh6BzLYdjIvHOjJOfn8v6wSQohEPGa22F1jqucC0oWhAyKEEDmll7WVr7rptOo5gHRk6CNMdsm8vqELp4vGgn6rEEJER4cyQv4uh+q5gHRh6B2IK78sWDj/qoYz2396pS50TdNMiZLlnzzhyC0MqZ4NSAeGDogQQuTPq+3In1fboXoOIB0Z+ggDIDkEBIA0AgJAGgEBII2AYMo5/OqDK1XPYBQEBFPOko/duU/1DEZh+F/jGlXjXU8ujfpHHHosbvb+5YLmos/Vtqqeabzs+82961Ze98B21XPICvd1OlpeeHy5Pb94INx7wWNxuELln/3aQbPVnlA923uxAzGoins+daz6idv3zv3JrXv6XzlWEekfzlA9E94RDQaceYtXn5996327TFZ7zH/i9SLVM/0p7EAMqvPp+oqhN5sLhRAiNjjqCDX3OK1e96DqufB7FmfWqLOkakgIIezTpg9GAgOZqmf6UwiIAfn3nfYGT7blzX30ln3mTFv81N/+bEUiEuUmKJOIZjIl3vm3puuxhKZyng/CEcaA4sGwxZxpi5ozbfGRhg5XuLU3V/VMSE/sQAzIU1fd27fjaPmJLz66xpafHbSXTvOrngnpSdN1XfUMYuktP35Z9Qyq6Z/pVz1CSjgfy1E9wrjwV02Nn70nHvzW+mS+niMMAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApBEQANIICABpBASANAICQBoBASCNgACQRkAASCMgAKQREADSCAgAaQQEgDQCAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBIG1SvLk2gPTEDgSANAICQBoBASCNgACQRkAASCMgAKQREADSCAgAaQQEgDQCAkAaAQEgjYAAkEZAAEgjIACkERAA0ggIAGkEBIA0AgJAGgEBII2AAJBGQABIIyAApBEQANL+D7vRSmjps6oAAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# If you have 2 lists\n", + "plt.rcParams[\"figure.figsize\"] = (4,4)\n", + "squarify.plot(sizes=cs, label=ls, alpha=.7 )\n", + "plt.axis('off')\n", + "plt.savefig('letter-treemap.png', bbox_inches='tight')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'treattlpis'" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat(random_english_letter() for _ in range(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'lbycjleuqz'" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import random\n", + "import string\n", + "\n", + "cat(random.choices(string.ascii_lowercase, k=10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}