{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word list builder\n",
    "\n",
    "Builds `words.txt` from the system British English dictionary. Each entry is stripped of accents, then kept only if it consists purely of lowercase ASCII letters and contains none of the blocked rude fragments."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import string\n",
    "import unicodedata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to change lives here.\n",
    "DICT_PATH = '/usr/share/dict/british-english'\n",
    "# NOTE(review): assumes the system word list is UTF-8 encoded; adjust if yours differs.\n",
    "DICT_ENCODING = 'utf-8'\n",
    "WORDS_PATH = 'words.txt'\n",
    "# A word is rejected if it contains any of these fragments anywhere inside it.\n",
    "RUDE_FRAGMENTS = 'piss shit cunt fuck arse crap fart jizz whore bitch'.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Curly quotes have no ASCII decomposition under NFKD, so map them to straight\n",
    "# quotes by hand; otherwise the ASCII encode step in unaccent would drop them.\n",
    "unaccent_specials = str.maketrans({\"’\": \"'\", '“': '\"', '”': '\"'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def unaccent(text):\n",
    "    \"\"\"Return text reduced to plain ASCII.\n",
    "\n",
    "    Curly quotes are first replaced by their straight equivalents, then the\n",
    "    text is NFKD-normalised so that accented letters split into a base letter\n",
    "    plus combining marks. Encoding to ASCII with errors ignored discards the\n",
    "    marks and every other non-ASCII character.\n",
    "    \"\"\"\n",
    "    translated_text = text.translate(unaccent_specials)\n",
    "    decomposed = unicodedata.normalize('NFKD', translated_text)\n",
    "    # The bytes are pure ASCII at this point, so decoding cannot fail.\n",
    "    return decomposed.encode('ascii', 'ignore').decode('ascii')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def only_lower(text):\n",
    "    \"\"\"Return True if every character of text is one of a-z.\n",
    "\n",
    "    An empty string has no offending characters, so it also returns True.\n",
    "    \"\"\"\n",
    "    return all((c in string.ascii_lowercase) for c in text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !find /usr -type f -iname 'british-english*'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !ls -lah /usr/share/dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def rude(word):\n",
    "    \"\"\"Return True if word contains any entry of RUDE_FRAGMENTS.\n",
    "\n",
    "    Matching is by substring, so a word is rejected even when the fragment\n",
    "    only appears in the middle of it.\n",
    "    \"\"\"\n",
    "    return any(w in word for w in RUDE_FRAGMENTS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rude('fucks')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def valid_words(path=None):\n",
    "    \"\"\"Return the acceptable words from a dictionary file, in file order.\n",
    "\n",
    "    Each line is stripped and passed through unaccent once; the result is\n",
    "    kept only if it is made of lowercase ASCII letters and is not rude.\n",
    "    Duplicates are not removed.\n",
    "\n",
    "    path: dictionary file with one word per line; defaults to DICT_PATH.\n",
    "    \"\"\"\n",
    "    if path is None:\n",
    "        path = DICT_PATH\n",
    "    # The with-block closes the file as soon as it has been read, instead of\n",
    "    # leaving the handle open until the garbage collector gets to it.\n",
    "    with open(path, encoding=DICT_ENCODING) as dictionary:\n",
    "        candidates = [unaccent(line.strip()) for line in dictionary]\n",
    "    return [w for w in candidates if only_lower(w) and not rude(w)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def words_with_len(n, path=None):\n",
    "    \"\"\"Return the valid words that are exactly n characters long.\n",
    "\n",
    "    The length is measured after unaccenting, and the words come from\n",
    "    valid_words so both functions always apply the same filters.\n",
    "\n",
    "    path: dictionary file with one word per line; defaults to DICT_PATH.\n",
    "    \"\"\"\n",
    "    return [w for w in valid_words(path) if len(w) == n]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "62863 words\n"
     ]
    }
   ],
   "source": [
    "words = valid_words()\n",
    "print('{} words'.format(len(words)))\n",
    "# One word per line, with no trailing newline after the last word.\n",
    "with open(WORDS_PATH, 'w') as f:\n",
    "    f.write('\\n'.join(words))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}