Removing files from data analysis directory
[ou-summer-of-code-2017.git] / 08-word-chains / build-word-lists.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {
7 "collapsed": true
8 },
9 "outputs": [],
10 "source": [
11 "import unicodedata\n",
12 "import string"
13 ]
14 },
15 {
16 "cell_type": "code",
17 "execution_count": 2,
18 "metadata": {
19 "collapsed": true
20 },
21 "outputs": [],
22 "source": [
# Translation table that turns typographic (curly) quotes into their
# plain ASCII counterparts; used by unaccent() before normalisation.
unaccent_specials = str.maketrans({
    "’": "'",
    '“': '"',
    '”': '"',
})
24 ]
25 },
26 {
27 "cell_type": "code",
28 "execution_count": 3,
29 "metadata": {
30 "collapsed": true
31 },
32 "outputs": [],
33 "source": [
def unaccent(text):
    """Reduce `text` to plain ASCII.

    Curly quotes are first mapped to straight quotes through the module-level
    `unaccent_specials` table. The result is NFKD-normalised, which splits
    accented letters into a base letter plus combining marks, and every
    character that is still outside ASCII is then dropped.
    """
    straightened = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', straightened)
    # 'ignore' silently discards anything that cannot be encoded as ASCII.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
39 ]
40 },
41 {
42 "cell_type": "code",
43 "execution_count": 4,
44 "metadata": {
45 "collapsed": true
46 },
47 "outputs": [],
48 "source": [
def only_lower(text):
    """Return True when every character of `text` is an ASCII lowercase letter.

    An empty string yields True, since it has no character to reject.
    """
    for char in text:
        if char not in string.ascii_lowercase:
            return False
    return True
51 ]
52 },
53 {
54 "cell_type": "code",
55 "execution_count": 5,
56 "metadata": {},
57 "outputs": [],
58 "source": [
59 "# !find /usr -type f -iname 'british-english*'"
60 ]
61 },
62 {
63 "cell_type": "code",
64 "execution_count": 6,
65 "metadata": {},
66 "outputs": [],
67 "source": [
68 "# !ls -lah /usr/share/dict"
69 ]
70 },
71 {
72 "cell_type": "code",
73 "execution_count": 7,
74 "metadata": {
75 "collapsed": true
76 },
77 "outputs": [],
78 "source": [
def rude(word):
    """Return True if `word` contains any blocklisted term as a substring.

    The test is plain substring containment, so a word is flagged whenever a
    term appears anywhere inside it, not only when the word equals the term.
    """
    blocklist = 'piss shit cunt fuck arse crap fart jizz whore bitch'.split()
    for term in blocklist:
        if term in word:
            return True
    return False
82 ]
83 },
84 {
85 "cell_type": "code",
86 "execution_count": 8,
87 "metadata": {},
88 "outputs": [
89 {
90 "data": {
91 "text/plain": [
92 "True"
93 ]
94 },
95 "execution_count": 8,
96 "metadata": {},
97 "output_type": "execute_result"
98 }
99 ],
100 "source": [
101 "rude('fucks')"
102 ]
103 },
104 {
105 "cell_type": "code",
106 "execution_count": 9,
107 "metadata": {
108 "collapsed": true
109 },
110 "outputs": [],
111 "source": [
def words_with_len(n, wordlist_path='/usr/share/dict/british-english'):
    """Collect the usable words of exactly `n` letters from a word list.

    Each line of the file is stripped and passed through `unaccent`. The
    cleaned word is kept only if it is exactly `n` characters long, consists
    solely of ASCII lowercase letters (`only_lower`), and is not flagged by
    `rude`.

    Parameters:
        n: required word length, measured after unaccenting.
        wordlist_path: text file with one word per line. Defaults to the
            system British English dictionary.

    Returns:
        A list of the cleaned words, in file order. Duplicates are not
        removed here.
    """
    selected = []
    # The with-block guarantees the file is closed, even if a helper raises.
    with open(wordlist_path) as wordlist:
        for line in wordlist:
            # Clean each line once instead of once per filter condition.
            word = unaccent(line.strip())
            if len(word) == n and only_lower(word) and not rude(word):
                selected.append(word)
    return selected
117 ]
118 },
119 {
120 "cell_type": "code",
121 "execution_count": 10,
122 "metadata": {},
123 "outputs": [
124 {
125 "name": "stdout",
126 "output_type": "stream",
127 "text": [
128 "4566 5-letter words\n",
129 "7223 6-letter words\n",
130 "9815 7-letter words\n",
131 "10328 8-letter words\n"
132 ]
133 }
134 ],
135 "source": [
dicts = {}

# For each target length: gather the words, report how many were found,
# then write the de-duplicated, sorted words to words<n>.txt.
for n in (5, 6, 7, 8):
    found = words_with_len(n)
    dicts[n] = found
    print('{} {}-letter words'.format(len(found), n))
    with open('words{}.txt'.format(n), 'w') as f:
        unique_sorted = sorted(set(found))
        f.write('\n'.join(unique_sorted))
143 ]
144 },
145 {
146 "cell_type": "code",
147 "execution_count": null,
148 "metadata": {
149 "collapsed": true
150 },
151 "outputs": [],
152 "source": []
153 }
154 ],
155 "metadata": {
156 "kernelspec": {
157 "display_name": "Python 3",
158 "language": "python",
159 "name": "python3"
160 },
161 "language_info": {
162 "codemirror_mode": {
163 "name": "ipython",
164 "version": 3
165 },
166 "file_extension": ".py",
167 "mimetype": "text/x-python",
168 "name": "python",
169 "nbconvert_exporter": "python",
170 "pygments_lexer": "ipython3",
171 "version": "3.5.2+"
172 }
173 },
174 "nbformat": 4,
175 "nbformat_minor": 2
176 }