X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;ds=sidebyside;f=09-word-chains%2Fbuild-word-lists.ipynb;fp=09-word-chains%2Fbuild-word-lists.ipynb;h=0000000000000000000000000000000000000000;hb=2f750972c876ebbb88e1af96be1ce74b4d3558f8;hp=d12df7877222bf4e3793fa7e1a9f17bd1b90f4a9;hpb=63b5f6c0b7b6c18464cce2bc888cf26491b5e603;p=ou-summer-of-code-2017.git diff --git a/09-word-chains/build-word-lists.ipynb b/09-word-chains/build-word-lists.ipynb deleted file mode 100644 index d12df78..0000000 --- a/09-word-chains/build-word-lists.ipynb +++ /dev/null @@ -1,201 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "import unicodedata\n", - "import string" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "unaccent_specials = ''.maketrans({\"’\": \"'\", '“': '\"', '”': '\"'})" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def unaccent(text):\n", - " translated_text = text.translate(unaccent_specials)\n", - " return unicodedata.normalize('NFKD', translated_text).\\\n", - " encode('ascii', 'ignore').\\\n", - " decode('utf-8')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def only_lower(text):\n", - " return all((c in string.ascii_lowercase) for c in text)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/usr/share/dict/british-english\n", - "find: ‘/usr/share/doc/google-chrome-stable’: Permission denied\n", - "/usr/share/man/man5/british-english.5.gz\n" - ] - } - ], - "source": [ - "# !find /usr -type f -iname 'british-english*'" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 2.3M\r\n", - "drwxr-xr-x 2 root root 4.0K Dec 29 12:37 .\r\n", - "drwxr-xr-x 640 root root 20K Apr 13 17:05 ..\r\n", - "-rw-r--r-- 1 root root 917K Oct 23 2011 american-english\r\n", - "-rw-r--r-- 1 root root 917K Oct 23 2011 british-english\r\n", - "-rw-r--r-- 1 root root 467K Aug 25 2016 cracklib-small\r\n", - "-rw-r--r-- 1 root root 199 Aug 29 2016 README.select-wordlist\r\n", - "lrwxrwxrwx 1 root root 30 Nov 10 2014 words -> /etc/dictionaries-common/words\r\n", - "lrwxrwxrwx 1 root root 16 Jun 18 2014 words.pre-dictionaries-common -> american-english\r\n" - ] - } - ], - "source": [ - "# !ls -lah /usr/share/dict" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def rude(word):\n", - " return any(w in word \n", - " for w in 'piss shit cunt fuck arse crap fart jizz whore bitch'.split())" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rude('fucks')" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "def words_with_len(n):\n", - " return [unaccent(w.strip()) for w in open('/usr/share/dict/british-english').readlines()\n", - " if only_lower(unaccent(w.strip()))\n", - " if len(unaccent(w.strip())) == n\n", - " if not rude(unaccent(w.strip()))]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2336 4-letter words\n", - "4566 5-letter words\n", - "7223 6-letter words\n" - ] - } - ], - "source": [ - "dicts = {}\n", - "\n", - "for n in [4, 5, 6]:\n", - " dicts[n] = words_with_len(n)\n", - " print('{} {}-letter words'.format(len(dicts[n]), n))\n", - " with open('words{}.txt'.format(n), 'w') as f:\n", - " f.write('\\n'.join(sorted(set(dicts[n]))))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.2+" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}