03-door-codes/build-word-lists.ipynb

   1 {
   2  "cells": [
   3   {
   4    "cell_type": "code",
   5    "execution_count": 1,
   6    "metadata": {
   7     "collapsed": true
   8    },
   9    "outputs": [],
  10    "source": [
  11     "import unicodedata\n",
  12     "import string"
  13    ]
  14   },
  15   {
  16    "cell_type": "code",
  17    "execution_count": 2,
  18    "metadata": {
  19     "collapsed": true
  20    },
  21    "outputs": [],
  22    "source": [
  23     "unaccent_specials = ''.maketrans({\"’\": \"'\", '“': '\"', '”': '\"'})"
  24    ]
  25   },
  26   {
  27    "cell_type": "code",
  28    "execution_count": 3,
  29    "metadata": {
  30     "collapsed": true
  31    },
  32    "outputs": [],
  33    "source": [
  34     "def unaccent(text):\n",
  35     "    translated_text = text.translate(unaccent_specials)\n",
  36     "    return unicodedata.normalize('NFKD', translated_text).\\\n",
  37     "        encode('ascii', 'ignore').\\\n",
  38     "        decode('utf-8')"
  39    ]
  40   },
  41   {
  42    "cell_type": "code",
  43    "execution_count": 4,
  44    "metadata": {
  45     "collapsed": true
  46    },
  47    "outputs": [],
  48    "source": [
  49     "def only_lower(text):\n",
  50     "    return all((c in string.ascii_lowercase) for c in text)"
  51    ]
  52   },
  53   {
  54    "cell_type": "code",
  55    "execution_count": 5,
  56    "metadata": {},
  57    "outputs": [],
  58    "source": [
  59     "# !find /usr -type f -iname 'british-english*'"
  60    ]
  61   },
  62   {
  63    "cell_type": "code",
  64    "execution_count": 6,
  65    "metadata": {},
  66    "outputs": [],
  67    "source": [
  68     "# !ls -lah /usr/share/dict"
  69    ]
  70   },
  71   {
  72    "cell_type": "code",
  73    "execution_count": 7,
  74    "metadata": {
  75     "collapsed": true
  76    },
  77    "outputs": [],
  78    "source": [
  79     "def rude(word):\n",
  80     "    return any(w in word \n",
  81     "               for w in 'piss shit cunt fuck arse crap fart jizz whore bitch'.split())"
  82    ]
  83   },
  84   {
  85    "cell_type": "code",
  86    "execution_count": 8,
  87    "metadata": {},
  88    "outputs": [
  89     {
  90      "data": {
  91       "text/plain": [
  92        "True"
  93       ]
  94      },
  95      "execution_count": 8,
  96      "metadata": {},
  97      "output_type": "execute_result"
  98     }
  99    ],
 100    "source": [
  101     "# Spot check: rude() matches substrings, so an inflected form is flagged too.\n",
  102     "rude('fucks')"
 102    ]
 103   },
 104   {
 105    "cell_type": "code",
 106    "execution_count": 10,
 107    "metadata": {},
 108    "outputs": [],
 109    "source": [
 110     "def valid_words():\n",
 111     "    return [unaccent(w.strip()) for w in open('/usr/share/dict/british-english').readlines()\n",
 112     "               if only_lower(unaccent(w.strip()))\n",
 113     "               if not rude(unaccent(w.strip()))]"
 114    ]
 115   },
 116   {
 117    "cell_type": "code",
 118    "execution_count": 11,
 119    "metadata": {
 120     "collapsed": true
 121    },
 122    "outputs": [],
 123    "source": [
 124     "def words_with_len(n):\n",
 125     "    return [unaccent(w.strip()) for w in open('/usr/share/dict/british-english').readlines()\n",
 126     "               if only_lower(unaccent(w.strip()))\n",
 127     "               if len(unaccent(w.strip())) == n\n",
 128     "               if not rude(unaccent(w.strip()))]"
 129    ]
 130   },
 131   {
 132    "cell_type": "code",
 133    "execution_count": 13,
 134    "metadata": {},
 135    "outputs": [
 136     {
 137      "name": "stdout",
 138      "output_type": "stream",
 139      "text": [
 140       "62863 words\n"
 141      ]
 142     }
 143    ],
 144    "source": [
 145     "words = valid_words()\n",
 146     "print('{} words'.format(len(words)))\n",
 147     "with open('words.txt', 'w') as f:\n",
 148     "    f.write('\\n'.join(words))\n"
 149    ]
 150   },
 151   {
 152    "cell_type": "code",
 153    "execution_count": null,
 154    "metadata": {
 155     "collapsed": true
 156    },
 157    "outputs": [],
 158    "source": []
 159   }
 160  ],
 161  "metadata": {
 162   "kernelspec": {
 163    "display_name": "Python 3",
 164    "language": "python",
 165    "name": "python3"
 166   },
 167   "language_info": {
 168    "codemirror_mode": {
 169     "name": "ipython",
 170     "version": 3
 171    },
 172    "file_extension": ".py",
 173    "mimetype": "text/x-python",
 174    "name": "python",
 175    "nbconvert_exporter": "python",
 176    "pygments_lexer": "ipython3",
 177    "version": "3.5.2+"
 178   }
 179  },
 180  "nbformat": 4,
 181  "nbformat_minor": 2
 182 }