Removing files from data analysis directory
[ou-summer-of-code-2017.git] / 03-door-codes / build-word-lists.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {
7 "collapsed": true
8 },
9 "outputs": [],
10 "source": [
11 "import unicodedata\n",
12 "import string"
13 ]
14 },
15 {
16 "cell_type": "code",
17 "execution_count": 2,
18 "metadata": {
19 "collapsed": true
20 },
21 "outputs": [],
22 "source": [
23 "# Translation table that turns typographic (curly) quotes into plain ASCII quotes.\n",
24 "unaccent_specials = str.maketrans({\"’\": \"'\", '“': '\"', '”': '\"'})"
24 ]
25 },
26 {
27 "cell_type": "code",
28 "execution_count": 3,
29 "metadata": {
30 "collapsed": true
31 },
32 "outputs": [],
33 "source": [
34 "def unaccent(text):\n",
35 "    \"\"\"Return `text` reduced to plain ASCII.\n",
36 "\n",
37 "    Curly quotes are first mapped to straight ones via `unaccent_specials`;\n",
38 "    the text is then NFKD-decomposed and every character that still has no\n",
39 "    ASCII encoding (combining accents, letters such as 'ø') is dropped.\n",
40 "    \"\"\"\n",
41 "    straightened = text.translate(unaccent_specials)\n",
42 "    decomposed = unicodedata.normalize('NFKD', straightened)\n",
43 "    ascii_bytes = decomposed.encode('ascii', 'ignore')\n",
44 "    return ascii_bytes.decode('utf-8')"
39 ]
40 },
41 {
42 "cell_type": "code",
43 "execution_count": 4,
44 "metadata": {
45 "collapsed": true
46 },
47 "outputs": [],
48 "source": [
49 "def only_lower(text):\n",
50 "    \"\"\"Return True when `text` contains nothing but ASCII lowercase letters.\n",
51 "\n",
52 "    An empty string has no offending characters, so it also yields True.\n",
53 "    \"\"\"\n",
54 "    for character in text:\n",
55 "        if character not in string.ascii_lowercase:\n",
56 "            return False\n",
57 "    return True"
51 ]
52 },
53 {
54 "cell_type": "code",
55 "execution_count": 5,
56 "metadata": {},
57 "outputs": [],
58 "source": [
59 "# !find /usr -type f -iname 'british-english*'"
60 ]
61 },
62 {
63 "cell_type": "code",
64 "execution_count": 6,
65 "metadata": {},
66 "outputs": [],
67 "source": [
68 "# !ls -lah /usr/share/dict"
69 ]
70 },
71 {
72 "cell_type": "code",
73 "execution_count": 7,
74 "metadata": {
75 "collapsed": true
76 },
77 "outputs": [],
78 "source": [
79 "def rude(word):\n",
80 "    \"\"\"Return True if `word` contains any blocklisted fragment as a substring.\n",
81 "\n",
82 "    Matching is by substring, so harmless words that merely embed a fragment\n",
83 "    (for example 'parse' contains 'arse') are reported as rude too.\n",
84 "    \"\"\"\n",
85 "    blocklist = 'piss shit cunt fuck arse crap fart jizz whore bitch'.split()\n",
86 "    for fragment in blocklist:\n",
87 "        if fragment in word:\n",
88 "            return True\n",
89 "    return False"
82 ]
83 },
84 {
85 "cell_type": "code",
86 "execution_count": 8,
87 "metadata": {},
88 "outputs": [
89 {
90 "data": {
91 "text/plain": [
92 "True"
93 ]
94 },
95 "execution_count": 8,
96 "metadata": {},
97 "output_type": "execute_result"
98 }
99 ],
100 "source": [
101 "rude('fucks')"
102 ]
103 },
104 {
105 "cell_type": "code",
106 "execution_count": 10,
107 "metadata": {},
108 "outputs": [],
109 "source": [
110 "def valid_words():\n",
111 "    \"\"\"Return the usable words from the system British English word list.\n",
112 "\n",
113 "    Each line of /usr/share/dict/british-english is stripped and passed\n",
114 "    through `unaccent`; the result is kept only if it is entirely ASCII\n",
115 "    lowercase (`only_lower`) and is not flagged by `rude`. The order of the\n",
116 "    word list is preserved.\n",
117 "\n",
118 "    Raises OSError (e.g. FileNotFoundError) if the word list cannot be opened.\n",
119 "    \"\"\"\n",
120 "    # Use a context manager so the dictionary file is closed promptly.\n",
121 "    with open('/usr/share/dict/british-english') as dictionary:\n",
122 "        # Normalise each entry once instead of once per filter.\n",
123 "        candidates = [unaccent(line.strip()) for line in dictionary]\n",
124 "    return [word for word in candidates\n",
125 "            if only_lower(word) and not rude(word)]"
114 ]
115 },
116 {
117 "cell_type": "code",
118 "execution_count": 11,
119 "metadata": {
120 "collapsed": true
121 },
122 "outputs": [],
123 "source": [
124 "def words_with_len(n):\n",
125 "    \"\"\"Return the valid words that are exactly `n` characters long.\n",
126 "\n",
127 "    Delegates to `valid_words`, so the same lowercase-only and `rude`\n",
128 "    filtering applies. Length is measured on the normalised (unaccented)\n",
129 "    word, and the order of the underlying word list is preserved.\n",
130 "    \"\"\"\n",
131 "    # Reuse valid_words() rather than repeating its read/normalise/filter steps.\n",
132 "    return [word for word in valid_words() if len(word) == n]"
129 ]
130 },
131 {
132 "cell_type": "code",
133 "execution_count": 13,
134 "metadata": {},
135 "outputs": [
136 {
137 "name": "stdout",
138 "output_type": "stream",
139 "text": [
140 "62863 words\n"
141 ]
142 }
143 ],
144 "source": [
145 "# Build the filtered word list, report its size, and save it one word per line\n",
146 "# (no newline is written after the final word).\n",
147 "words = valid_words()\n",
148 "word_count = len(words)\n",
149 "print('{} words'.format(word_count))\n",
150 "newline_separated = '\\n'.join(words)\n",
151 "with open('words.txt', 'w') as f:\n",
152 "    f.write(newline_separated)\n"
149 ]
150 },
151 {
152 "cell_type": "code",
153 "execution_count": null,
154 "metadata": {
155 "collapsed": true
156 },
157 "outputs": [],
158 "source": []
159 }
160 ],
161 "metadata": {
162 "kernelspec": {
163 "display_name": "Python 3",
164 "language": "python",
165 "name": "python3"
166 },
167 "language_info": {
168 "codemirror_mode": {
169 "name": "ipython",
170 "version": 3
171 },
172 "file_extension": ".py",
173 "mimetype": "text/x-python",
174 "name": "python",
175 "nbconvert_exporter": "python",
176 "pygments_lexer": "ipython3",
177 "version": "3.5.2+"
178 }
179 },
180 "nbformat": 4,
181 "nbformat_minor": 2
182 }