X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=09-word-chains%2Fbuild-word-lists.ipynb;fp=09-word-chains%2Fbuild-word-lists.ipynb;h=d12df7877222bf4e3793fa7e1a9f17bd1b90f4a9;hb=9db793681c67b0ea1be1a404a5a7c1d5afc26610;hp=0000000000000000000000000000000000000000;hpb=76d7dcd5ad275f76e38a33a45e0fbdf2948c6b29;p=ou-summer-of-code-2017.git diff --git a/09-word-chains/build-word-lists.ipynb b/09-word-chains/build-word-lists.ipynb new file mode 100644 index 0000000..d12df78 --- /dev/null +++ b/09-word-chains/build-word-lists.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import unicodedata\n", + "import string" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "unaccent_specials = ''.maketrans({\"’\": \"'\", '“': '\"', '”': '\"'})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def unaccent(text):\n", + " translated_text = text.translate(unaccent_specials)\n", + " return unicodedata.normalize('NFKD', translated_text).\\\n", + " encode('ascii', 'ignore').\\\n", + " decode('utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def only_lower(text):\n", + " return all((c in string.ascii_lowercase) for c in text)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/share/dict/british-english\n", + "find: ‘/usr/share/doc/google-chrome-stable’: Permission denied\n", + "/usr/share/man/man5/british-english.5.gz\n" + ] + } + ], + "source": [ + "# !find /usr -type f -iname 'british-english*'" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2.3M\r\n", + "drwxr-xr-x 2 root root 4.0K Dec 29 12:37 .\r\n", + "drwxr-xr-x 640 root root 20K Apr 13 17:05 ..\r\n", + "-rw-r--r-- 1 root root 917K Oct 23 2011 american-english\r\n", + "-rw-r--r-- 1 root root 917K Oct 23 2011 british-english\r\n", + "-rw-r--r-- 1 root root 467K Aug 25 2016 cracklib-small\r\n", + "-rw-r--r-- 1 root root 199 Aug 29 2016 README.select-wordlist\r\n", + "lrwxrwxrwx 1 root root 30 Nov 10 2014 words -> /etc/dictionaries-common/words\r\n", + "lrwxrwxrwx 1 root root 16 Jun 18 2014 words.pre-dictionaries-common -> american-english\r\n" + ] + } + ], + "source": [ + "# !ls -lah /usr/share/dict" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def rude(word):\n", + " return any(w in word \n", + " for w in 'piss shit cunt fuck arse crap fart jizz whore bitch'.split())" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rude('fucks')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def words_with_len(n):\n", + " return [unaccent(w.strip()) for w in open('/usr/share/dict/british-english').readlines()\n", + " if only_lower(unaccent(w.strip()))\n", + " if len(unaccent(w.strip())) == n\n", + " if not rude(unaccent(w.strip()))]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2336 4-letter words\n", + "4566 5-letter words\n", + "7223 6-letter words\n" + ] + } + ], + "source": [ + "dicts = {}\n", + "\n", + "for n in [4, 5, 6]:\n", + " dicts[n] = words_with_len(n)\n", + " print('{} {}-letter words'.format(len(dicts[n]), n))\n", + " with open('words{}.txt'.format(n), 'w') as f:\n", + " f.write('\\n'.join(sorted(set(dicts[n]))))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2+" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}