08-word-chains/build-word-lists.ipynb

   1 {
   2  "cells": [
   3   {
   4    "cell_type": "code",
   5    "execution_count": 1,
   6    "metadata": {
   7     "collapsed": true
   8    },
   9    "outputs": [],
  10    "source": [
  11     "import unicodedata\n",
  12     "import string"
  13    ]
  14   },
  15   {
  16    "cell_type": "code",
  17    "execution_count": 2,
  18    "metadata": {
  19     "collapsed": true
  20    },
  21    "outputs": [],
  22    "source": [
  23     "unaccent_specials = ''.maketrans({\"’\": \"'\", '“': '\"', '”': '\"'})"
  24    ]
  25   },
  26   {
  27    "cell_type": "code",
  28    "execution_count": 3,
  29    "metadata": {
  30     "collapsed": true
  31    },
  32    "outputs": [],
  33    "source": [
  34     "def unaccent(text):\n",
  35     "    translated_text = text.translate(unaccent_specials)\n",
  36     "    return unicodedata.normalize('NFKD', translated_text).\\\n",
  37     "        encode('ascii', 'ignore').\\\n",
  38     "        decode('utf-8')"
  39    ]
  40   },
  41   {
  42    "cell_type": "code",
  43    "execution_count": 4,
  44    "metadata": {
  45     "collapsed": true
  46    },
  47    "outputs": [],
  48    "source": [
  49     "def only_lower(text):\n",
  50     "    return all((c in string.ascii_lowercase) for c in text)"
  51    ]
  52   },
  53   {
  54    "cell_type": "code",
  55    "execution_count": 5,
  56    "metadata": {},
  57    "outputs": [],
  58    "source": [
  59     "# !find /usr -type f -iname 'british-english*'"
  60    ]
  61   },
  62   {
  63    "cell_type": "code",
  64    "execution_count": 6,
  65    "metadata": {},
  66    "outputs": [],
  67    "source": [
  68     "# !ls -lah /usr/share/dict"
  69    ]
  70   },
  71   {
  72    "cell_type": "code",
  73    "execution_count": 7,
  74    "metadata": {
  75     "collapsed": true
  76    },
  77    "outputs": [],
  78    "source": [
  79     "def rude(word):\n",
  80     "    return any(w in word \n",
  81     "               for w in 'piss shit cunt fuck arse crap fart jizz whore bitch'.split())"
  82    ]
  83   },
  84   {
  85    "cell_type": "code",
  86    "execution_count": 8,
  87    "metadata": {},
  88    "outputs": [
  89     {
  90      "data": {
  91       "text/plain": [
  92        "True"
  93       ]
  94      },
  95      "execution_count": 8,
  96      "metadata": {},
  97      "output_type": "execute_result"
  98     }
  99    ],
 100    "source": [
  101     "rude('fucks')  # smoke check: 'fuck' is a substring of 'fucks', so this evaluates to True"
 102    ]
 103   },
 104   {
 105    "cell_type": "code",
 106    "execution_count": 9,
 107    "metadata": {
 108     "collapsed": true
 109    },
 110    "outputs": [],
 111    "source": [
 112     "def words_with_len(n):\n",
 113     "    return [unaccent(w.strip()) for w in open('/usr/share/dict/british-english').readlines()\n",
 114     "               if only_lower(unaccent(w.strip()))\n",
 115     "               if len(unaccent(w.strip())) == n\n",
 116     "               if not rude(unaccent(w.strip()))]"
 117    ]
 118   },
 119   {
 120    "cell_type": "code",
 121    "execution_count": 10,
 122    "metadata": {},
 123    "outputs": [
 124     {
 125      "name": "stdout",
 126      "output_type": "stream",
 127      "text": [
 128       "4566 5-letter words\n",
 129       "7223 6-letter words\n",
 130       "9815 7-letter words\n",
 131       "10328 8-letter words\n"
 132      ]
 133     }
 134    ],
 135    "source": [
 136     "dicts = {}\n",
 137     "\n",
 138     "for n in [5, 6, 7, 8]:\n",
 139     "    dicts[n] = words_with_len(n)\n",
 140     "    print('{} {}-letter words'.format(len(dicts[n]), n))\n",
 141     "    with open('words{}.txt'.format(n), 'w') as f:\n",
 142     "        f.write('\\n'.join(sorted(set(dicts[n]))))\n"
 143    ]
 144   },
 145   {
 146    "cell_type": "code",
 147    "execution_count": null,
 148    "metadata": {
 149     "collapsed": true
 150    },
 151    "outputs": [],
 152    "source": []
 153   }
 154  ],
 155  "metadata": {
 156   "kernelspec": {
 157    "display_name": "Python 3",
 158    "language": "python",
 159    "name": "python3"
 160   },
 161   "language_info": {
 162    "codemirror_mode": {
 163     "name": "ipython",
 164     "version": 3
 165    },
 166    "file_extension": ".py",
 167    "mimetype": "text/x-python",
 168    "name": "python",
 169    "nbconvert_exporter": "python",
 170    "pygments_lexer": "ipython3",
 171    "version": "3.5.2+"
 172   }
 173  },
 174  "nbformat": 4,
 175  "nbformat_minor": 2
 176 }