{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import unicodedata\n",
    "import string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Typographic quotes are not decomposed to ASCII by NFKD, so `unaccent`\n",
    "# would drop them; translate them to their ASCII equivalents first.\n",
    "unaccent_specials = str.maketrans({\n",
    "    '‘': \"'\",\n",
    "    '’': \"'\",\n",
    "    '“': '\"',\n",
    "    '”': '\"',\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def unaccent(text):\n",
    "    \"\"\"Return `text` reduced to plain ASCII.\n",
    "\n",
    "    Typographic quotes are first replaced via `unaccent_specials`. The text\n",
    "    is then NFKD-normalised, which splits accented letters into a base\n",
    "    letter plus combining marks, and every code point that is still\n",
    "    non-ASCII afterwards is dropped.\n",
    "    \"\"\"\n",
    "    translated_text = text.translate(unaccent_specials)\n",
    "    decomposed = unicodedata.normalize('NFKD', translated_text)\n",
    "    # Decode with the codec used to encode: the bytes are ASCII by construction.\n",
    "    return decomposed.encode('ascii', 'ignore').decode('ascii')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def only_lower(text):\n",
    "    \"\"\"Return True if every character of `text` is one of a-z.\n",
    "\n",
    "    An empty string yields True, because `all` of an empty iterable is True.\n",
    "    \"\"\"\n",
    "    return all(c in string.ascii_lowercase for c in text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/usr/share/dict/british-english\n",
      "find: ‘/usr/share/doc/google-chrome-stable’: Permission denied\n",
      "/usr/share/man/man5/british-english.5.gz\n"
     ]
    }
   ],
   "source": [
    "# !find /usr -type f -iname 'british-english*'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 2.3M\r\n",
      "drwxr-xr-x 2 root root 4.0K Dec 29 12:37 .\r\n",
      "drwxr-xr-x 640 root root 20K Apr 13 17:05 ..\r\n",
      "-rw-r--r-- 1 root root 917K Oct 23 2011 american-english\r\n",
      "-rw-r--r-- 1 root root 917K Oct 23 2011 british-english\r\n",
      "-rw-r--r-- 1 root root 467K Aug 25 2016 cracklib-small\r\n",
      "-rw-r--r-- 1 root root 199 Aug 29 2016 README.select-wordlist\r\n",
      "lrwxrwxrwx 1 root root 30 Nov 10 2014 words -> /etc/dictionaries-common/words\r\n",
      "lrwxrwxrwx 1 root root 16 Jun 18 2014 words.pre-dictionaries-common -> american-english\r\n"
     ]
    }
   ],
   "source": [
    "# !ls -lah /usr/share/dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
"metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Substrings that mark a word as rude; built once instead of on every call.\n",
    "RUDE_FRAGMENTS = tuple(\n",
    "    'piss shit cunt fuck arse crap fart jizz whore bitch'.split())\n",
    "\n",
    "\n",
    "def rude(word):\n",
    "    \"\"\"Return True if `word` contains any rude fragment as a substring.\n",
    "\n",
    "    Matching is case-sensitive and substring-based, so derived forms such\n",
    "    as 'fucks' are caught, and so are innocent words that merely embed a\n",
    "    fragment (e.g. 'parse' contains 'arse').\n",
    "    \"\"\"\n",
    "    return any(fragment in word for fragment in RUDE_FRAGMENTS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rude('fucks')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "WORDLIST_PATH = '/usr/share/dict/british-english'\n",
    "\n",
    "\n",
    "def words_with_len(n, path=WORDLIST_PATH):\n",
    "    \"\"\"Return the unaccented, all-lowercase, non-rude words of length `n`.\n",
    "\n",
    "    Each line of the word list at `path` is stripped and unaccented once;\n",
    "    the result is kept only if it consists solely of a-z, has exactly `n`\n",
    "    characters and is not `rude`. Words are returned in file order, and\n",
    "    entries that unaccent to the same spelling are not de-duplicated.\n",
    "    \"\"\"\n",
    "    words = []\n",
    "    # Close the file deterministically instead of leaking the handle.\n",
    "    with open(path) as wordlist:\n",
    "        for line in wordlist:\n",
    "            word = unaccent(line.strip())\n",
    "            if only_lower(word) and len(word) == n and not rude(word):\n",
    "                words.append(word)\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2336 4-letter words\n",
      "4566 5-letter words\n",
      "7223 6-letter words\n"
     ]
    }
   ],
   "source": [
    "dicts = {}\n",
    "\n",
    "# NOTE: the printed count is taken before de-duplication, so it can exceed\n",
    "# the number of words actually written to the file.\n",
    "for n in [4, 5, 6]:\n",
    "    dicts[n] = words_with_len(n)\n",
    "    print('{} {}-letter words'.format(len(dicts[n]), n))\n",
    "    with open('words{}.txt'.format(n), 'w') as f:\n",
    "        f.write('\\n'.join(sorted(set(dicts[n]))))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}