Initial working
[imdb-keyword-list.git] / generate-random-plots.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "markdown",
5 "metadata": {},
6 "source": [
7 "# Generate random movie plot elements\n",
8 "\n",
9 "Data from [IMDB](http://www.imdb.com/interfaces), held at [ftp://ftp.fu-berlin.de/pub/misc/movies/database/](ftp://ftp.fu-berlin.de/pub/misc/movies/database/), in the `keywords.list.gz` file.\n",
10 "\n",
11 "See also the [Abulafia generator](http://www.random-generator.com/index.php?title=Plot_Keyword_Oracle), the [Story-games thread](http://story-games.com/forums/discussion/3502/new-toy-imdb-plot-keywords), and [an earlier version of the list](http://www.logrus.com/~moose/page1/files/cleanplots.txt)."
12 ]
13 },
14 {
15 "cell_type": "code",
16 "execution_count": 1,
17 "metadata": {
18 "collapsed": true
19 },
20 "outputs": [],
21 "source": [
22 "# Get the file\n",
23 "!wget ftp://ftp.fu-berlin.de/pub/misc/movies/database/keywords.list.gz\n",
24 "\n",
25 "!gunzip keywords.list.gz"
26 ]
27 },
28 {
29 "cell_type": "code",
30 "execution_count": 72,
31 "metadata": {
32 "collapsed": false
33 },
34 "outputs": [
35 {
36 "name": "stdout",
37 "output_type": "stream",
38 "text": [
39 "generate-random-plots.ipynb keywords.list\r\n"
40 ]
41 }
42 ],
43 "source": [
44 "!ls"
45 ]
46 },
47 {
48 "cell_type": "code",
49 "execution_count": 73,
50 "metadata": {
51 "collapsed": true
52 },
53 "outputs": [],
54 "source": [
55 "import collections\n",
56 "import re\n",
57 "import random\n",
58 "import itertools"
59 ]
60 },
61 {
62 "cell_type": "code",
63 "execution_count": 74,
64 "metadata": {
65 "collapsed": true
66 },
67 "outputs": [],
68 "source": [
def read_keywords_from_list(line):
    """Expand one row of the "keywords in use" table into a flat keyword list.

    A row consists of entries of the form ``keyword (count)`` separated by
    one or more tab characters. Each keyword is repeated ``count`` times in
    the result, so the list can be passed straight to ``Counter.update``.

    Args:
        line: One row of the table. Empty fields (caused by leading or
            trailing tabs, or an unstripped newline) are ignored.

    Returns:
        A list of keyword strings; empty if ``line`` is empty or blank.

    Raises:
        ValueError: If a non-blank entry is not of the form
            ``keyword (count)`` with an integer count.
    """
    if not line:
        return []
    keywords = []
    for entry in re.split('\t+', line):
        # Leading/trailing tabs and stray newlines produce blank fields;
        # skip them instead of failing on the tuple unpacking below.
        if not entry.strip():
            continue
        keyword, count = entry.split()
        # The count is wrapped in parentheses, e.g. "(2)".
        keywords.extend([keyword] * int(count[1:-1]))
    return keywords
78 ]
79 },
80 {
81 "cell_type": "code",
82 "execution_count": 75,
83 "metadata": {
84 "collapsed": false
85 },
86 "outputs": [
87 {
88 "data": {
89 "text/plain": [
90 "['blue-pantyhose', 'blue-party', 'blue-party', 'blue-pearl']"
91 ]
92 },
93 "execution_count": 75,
94 "metadata": {},
95 "output_type": "execute_result"
96 }
97 ],
98 "source": [
99 "read_keywords_from_list('blue-pantyhose (1)\t\tblue-party (2)\tblue-pearl (1)')"
100 ]
101 },
102 {
103 "cell_type": "code",
104 "execution_count": 76,
105 "metadata": {
106 "collapsed": true
107 },
108 "outputs": [],
109 "source": [
def read_keywords_from_title(line):
    """Extract the keyword from one ``title<TAB(s)>keyword`` line.

    Args:
        line: One line of the per-title keyword listing: a title, one or
            more tab characters, then a single keyword.

    Returns:
        A one-element list holding the keyword (the second tab-separated
        field), or an empty list when the line is empty or has no
        non-blank keyword field.
    """
    if not line:
        return []
    fields = re.split('\t+', line)
    # A line without a tab has only the title field; a trailing tab yields
    # an empty second field. Neither carries a keyword, so report none
    # rather than raising IndexError or counting an empty string.
    if len(fields) < 2 or not fields[1].strip():
        return []
    return [fields[1]]
115 ]
116 },
117 {
118 "cell_type": "code",
119 "execution_count": 77,
120 "metadata": {
121 "collapsed": false
122 },
123 "outputs": [
124 {
125 "data": {
126 "text/plain": [
127 "['beer-drinking']"
128 ]
129 },
130 "execution_count": 77,
131 "metadata": {},
132 "output_type": "execute_result"
133 }
134 ],
135 "source": [
136 "read_keywords_from_title('Been Down So Long It Looks Like Up to Me (1971)\t\tbeer-drinking')"
137 ]
138 },
139 {
140 "cell_type": "code",
141 "execution_count": 78,
142 "metadata": {
143 "collapsed": false
144 },
145 "outputs": [
146 {
147 "name": "stdout",
148 "output_type": "stream",
149 "text": [
150 "skipping underlines ========\n",
151 "ending list 5: Submission Rules\n",
152 "skipping underlines =============\n",
153 "skipping underlines ==================\n",
154 "skipping underlines ===============\n",
155 "starting list keywords in use:\n",
156 "ending list 5: Submission Rules\n",
157 "skipping underlines ===================\n",
158 "skipping underlines ==============\n",
159 "skipping underlines ======================\n",
160 "skipping underlines =================================================\n",
161 "pre-title 8: THE KEYWORDS LIST\n",
162 "starting title ====================\n"
163 ]
164 }
165 ],
166 "source": [
# Parse keywords.list in a single pass, driven by a small state machine.
# reading_state is one of:
#   None                  - outside any section of interest; lines are ignored
#   'keywords_from_list'  - inside the "keywords in use" table
#   'pre_from_title'      - the "8: THE KEYWORDS LIST" heading was just seen
#                           and its "===" underline is expected next
#   'keywords_from_title' - inside the per-title keyword listing
keywords_from_list = collections.Counter()
keywords_from_title = collections.Counter()
reading_state = None

# The context manager closes the file handle; iterating the file object
# avoids loading every line into memory at once.
with open('keywords.list', encoding='latin-1') as keywords_file:
    for line in keywords_file:
        stripped = line.strip()

        if stripped.startswith('keywords in use'):
            reading_state = 'keywords_from_list'
            print('starting list', stripped)
            continue
        if stripped.startswith('5: Submission Rules'):
            # Must assign to `reading_state` itself; a misspelled name would
            # silently create a new variable and leave the state unchanged.
            reading_state = None
            print('ending list', stripped)
            continue
        if reading_state == 'pre_from_title':
            # Checked before the generic '===' test below so that the
            # underline directly after the heading starts the title section
            # instead of resetting the state.
            if stripped.startswith('==='):
                reading_state = 'keywords_from_title'
                print('starting title', stripped)
            else:
                reading_state = None
                print('not start of titles', stripped)
            continue
        if stripped.startswith('8: THE KEYWORDS LIST'):
            reading_state = 'pre_from_title'
            print('pre-title', stripped)
            continue
        if stripped.startswith('==='):
            # Any other heading underline ends the current section.
            reading_state = None
            print('skipping underlines', stripped)
            continue

        if reading_state == 'keywords_from_list':
            keywords_from_list.update(read_keywords_from_list(stripped))
        elif reading_state == 'keywords_from_title':
            keywords_from_title.update(read_keywords_from_title(stripped))

# Total number of keyword occurrences counted from the "keywords in use" table.
sum_keywords = sum(keywords_from_list.values())
203 ]
204 },
205 {
206 "cell_type": "code",
207 "execution_count": 79,
208 "metadata": {
209 "collapsed": false
210 },
211 "outputs": [
212 {
213 "data": {
214 "text/plain": [
215 "[('sex', 83531),\n",
216 " ('hardcore', 69247),\n",
217 " ('character-name-in-title', 44747),\n",
218 " ('independent-film', 37932)]"
219 ]
220 },
221 "execution_count": 79,
222 "metadata": {},
223 "output_type": "execute_result"
224 }
225 ],
226 "source": [
227 "keywords_from_list.most_common(4)"
228 ]
229 },
230 {
231 "cell_type": "code",
232 "execution_count": 80,
233 "metadata": {
234 "collapsed": false
235 },
236 "outputs": [
237 {
238 "data": {
239 "text/plain": [
240 "[('sex', 83531),\n",
241 " ('hardcore', 69247),\n",
242 " ('character-name-in-title', 44747),\n",
243 " ('independent-film', 37932)]"
244 ]
245 },
246 "execution_count": 80,
247 "metadata": {},
248 "output_type": "execute_result"
249 }
250 ],
251 "source": [
252 "keywords_from_title.most_common(4)"
253 ]
254 },
255 {
256 "cell_type": "code",
257 "execution_count": 81,
258 "metadata": {
259 "collapsed": true
260 },
261 "outputs": [],
262 "source": [
def pick_random(items, count):
    """Pick one element at random, weighted by its multiplicity.

    Args:
        items: An object with an ``elements()`` method that yields each
            element once per occurrence (e.g. ``collections.Counter``).
        count: Total number of elements yielded by ``items.elements()``.

    Returns:
        The element at a uniformly chosen position in ``items.elements()``.

    Raises:
        ValueError: If ``count`` is less than 1.
        IndexError: If ``count`` exceeds the number of elements available.
    """
    # randrange excludes its upper bound, so the index is always in
    # [0, count). randint(0, count) could return `count` itself, which is one
    # past the last element and made the lookup below fail intermittently.
    index = random.randrange(count)
    selected = list(itertools.islice(items.elements(), index, index + 1))
    return selected[0]
266 ]
267 },
268 {
269 "cell_type": "code",
270 "execution_count": 85,
271 "metadata": {
272 "collapsed": false
273 },
274 "outputs": [
275 {
276 "name": "stdout",
277 "output_type": "stream",
278 "text": [
279 "mother-daughter-relationship\n",
280 "oral-sex\n",
281 "cyberpunk\n",
282 "main-character-shot\n",
283 "medical-gloves\n",
284 "tv-show\n",
285 "female-nudity\n",
286 "conundrum\n",
287 "los-angeles-california\n",
288 "elevator\n",
289 "yorkshireman\n",
290 "paper-money\n",
291 "unfaithful-husband\n",
292 "brain-teaser\n",
293 "birthday\n",
294 "ranch\n",
295 "festival\n",
296 "world-war-two\n",
297 "reference-to-wikipedia\n",
298 "sundance\n"
299 ]
300 }
301 ],
302 "source": [
303 "for _ in range(20):\n",
304 " print(pick_random(keywords_from_list, sum_keywords))"
305 ]
306 },
307 {
308 "cell_type": "code",
309 "execution_count": null,
310 "metadata": {
311 "collapsed": true
312 },
313 "outputs": [],
314 "source": []
315 }
316 ],
317 "metadata": {
318 "kernelspec": {
319 "display_name": "Python 3",
320 "language": "python",
321 "name": "python3"
322 },
323 "language_info": {
324 "codemirror_mode": {
325 "name": "ipython",
326 "version": 3
327 },
328 "file_extension": ".py",
329 "mimetype": "text/x-python",
330 "name": "python",
331 "nbconvert_exporter": "python",
332 "pygments_lexer": "ipython3",
333 "version": "3.4.3+"
334 }
335 },
336 "nbformat": 4,
337 "nbformat_minor": 0
338 }