generate-random-plots.ipynb

   1 {
   2  "cells": [
   3   {
   4    "cell_type": "markdown",
   5    "metadata": {},
   6    "source": [
   7     "# Generate random movie plot elements\n",
   8     "\n",
   9     "Data from [IMDB](http://www.imdb.com/interfaces), held at [ftp://ftp.fu-berlin.de/pub/misc/movies/database/](ftp://ftp.fu-berlin.de/pub/misc/movies/database/), in the `keywords.list.gz` file.\n",
  10     "\n",
   11     "See also the [Abulafia generator](http://www.random-generator.com/index.php?title=Plot_Keyword_Oracle), the [Story-games thread](http://story-games.com/forums/discussion/3502/new-toy-imdb-plot-keywords), and [an earlier version of the list](http://www.logrus.com/~moose/page1/files/cleanplots.txt)."
  12    ]
  13   },
  14   {
  15    "cell_type": "code",
  16    "execution_count": 1,
  17    "metadata": {
  18     "collapsed": true
  19    },
  20    "outputs": [],
  21    "source": [
  22     "# Get the file\n",
  23     "!wget ftp://ftp.fu-berlin.de/pub/misc/movies/database/keywords.list.gz\n",
  24     "\n",
  25     "!gunzip keywords.list.gz"
  26    ]
  27   },
  28   {
  29    "cell_type": "code",
  30    "execution_count": 72,
  31    "metadata": {
  32     "collapsed": false
  33    },
  34    "outputs": [
  35     {
  36      "name": "stdout",
  37      "output_type": "stream",
  38      "text": [
  39       "generate-random-plots.ipynb  keywords.list\r\n"
  40      ]
  41     }
  42    ],
  43    "source": [
  44     "!ls"
  45    ]
  46   },
  47   {
  48    "cell_type": "code",
  49    "execution_count": 73,
  50    "metadata": {
  51     "collapsed": true
  52    },
  53    "outputs": [],
  54    "source": [
  55     "import collections\n",
  56     "import re\n",
  57     "import random\n",
  58     "import itertools"
  59    ]
  60   },
  61   {
  62    "cell_type": "code",
  63    "execution_count": 74,
  64    "metadata": {
  65     "collapsed": true
  66    },
  67    "outputs": [],
  68    "source": [
  69     "def read_keywords_from_list(line):\n",
  70     "    if not line:\n",
  71     "        return []\n",
  72     "    accumulator = []\n",
  73     "    for kn in re.split('\\t+', line):\n",
  74     "        k, n = kn.split()\n",
  75     "        n = int(n[1:-1])\n",
  76     "        accumulator += [k] * n\n",
  77     "    return accumulator"
  78    ]
  79   },
  80   {
  81    "cell_type": "code",
  82    "execution_count": 75,
  83    "metadata": {
  84     "collapsed": false
  85    },
  86    "outputs": [
  87     {
  88      "data": {
  89       "text/plain": [
  90        "['blue-pantyhose', 'blue-party', 'blue-party', 'blue-pearl']"
  91       ]
  92      },
  93      "execution_count": 75,
  94      "metadata": {},
  95      "output_type": "execute_result"
  96     }
  97    ],
  98    "source": [
  99     "read_keywords_from_list('blue-pantyhose (1)\t\tblue-party (2)\tblue-pearl (1)')"
 100    ]
 101   },
 102   {
 103    "cell_type": "code",
 104    "execution_count": 76,
 105    "metadata": {
 106     "collapsed": true
 107    },
 108    "outputs": [],
 109    "source": [
 110     "def read_keywords_from_title(line):\n",
 111     "    if line:\n",
 112     "        return [re.split('\\t+', line)[1]]\n",
 113     "    else:\n",
 114     "        return []"
 115    ]
 116   },
 117   {
 118    "cell_type": "code",
 119    "execution_count": 77,
 120    "metadata": {
 121     "collapsed": false
 122    },
 123    "outputs": [
 124     {
 125      "data": {
 126       "text/plain": [
 127        "['beer-drinking']"
 128       ]
 129      },
 130      "execution_count": 77,
 131      "metadata": {},
 132      "output_type": "execute_result"
 133     }
 134    ],
 135    "source": [
 136     "read_keywords_from_title('Been Down So Long It Looks Like Up to Me (1971)\t\tbeer-drinking')"
 137    ]
 138   },
 139   {
 140    "cell_type": "code",
 141    "execution_count": 78,
 142    "metadata": {
 143     "collapsed": false
 144    },
 145    "outputs": [
 146     {
 147      "name": "stdout",
 148      "output_type": "stream",
 149      "text": [
 150       "skipping underlines ========\n",
 151       "ending list 5: Submission Rules\n",
 152       "skipping underlines =============\n",
 153       "skipping underlines ==================\n",
 154       "skipping underlines ===============\n",
 155       "starting list keywords in use:\n",
 156       "ending list 5: Submission Rules\n",
 157       "skipping underlines ===================\n",
 158       "skipping underlines ==============\n",
 159       "skipping underlines ======================\n",
 160       "skipping underlines =================================================\n",
 161       "pre-title 8: THE KEYWORDS LIST\n",
 162       "starting title ====================\n"
 163      ]
 164     }
 165    ],
 166    "source": [
 167     "keywords_from_list = collections.Counter()\n",
 168     "keywords_from_title = collections.Counter()\n",
 169     "reading_state = None\n",
 170     "for line in open('keywords.list', encoding='latin-1').readlines():\n",
 171     "        \n",
 172     "    if line.strip().startswith('keywords in use'):\n",
 173     "        reading_state = 'keywords_from_list'\n",
 174     "        print('starting list', line.strip())\n",
 175     "        continue\n",
 176     "    if line.strip().startswith('5: Submission Rules'):\n",
 177     "        readings_state = None\n",
 178     "        print('ending list', line.strip())\n",
 179     "        continue\n",
 180     "    if reading_state == 'pre_from_title':\n",
 181     "        if line.strip().startswith('==='):\n",
 182     "            reading_state = 'keywords_from_title'\n",
 183     "            print('starting title', line.strip())\n",
 184     "        else:\n",
 185     "            reading_state = None\n",
 186     "            print('not start of titles', line.strip())\n",
 187     "        continue\n",
 188     "    if line.strip().startswith('8: THE KEYWORDS LIST'):\n",
 189     "        reading_state = 'pre_from_title'\n",
 190     "        print('pre-title', line.strip())\n",
 191     "        continue\n",
 192     "    if line.strip().startswith('==='):\n",
 193     "        reading_state = None\n",
 194     "        print('skipping underlines', line.strip())\n",
 195     "        continue\n",
 196     "        \n",
 197     "    if reading_state == 'keywords_from_list':\n",
 198     "        keywords_from_list.update(read_keywords_from_list(line.strip()))\n",
 199     "    elif reading_state == 'keywords_from_title':\n",
 200     "        keywords_from_title.update(read_keywords_from_title(line.strip()))\n",
 201     "\n",
 202     "sum_keywords = sum(keywords_from_list.values())"
 203    ]
 204   },
 205   {
 206    "cell_type": "code",
 207    "execution_count": 79,
 208    "metadata": {
 209     "collapsed": false
 210    },
 211    "outputs": [
 212     {
 213      "data": {
 214       "text/plain": [
 215        "[('sex', 83531),\n",
 216        " ('hardcore', 69247),\n",
 217        " ('character-name-in-title', 44747),\n",
 218        " ('independent-film', 37932)]"
 219       ]
 220      },
 221      "execution_count": 79,
 222      "metadata": {},
 223      "output_type": "execute_result"
 224     }
 225    ],
 226    "source": [
 227     "keywords_from_list.most_common(4)"
 228    ]
 229   },
 230   {
 231    "cell_type": "code",
 232    "execution_count": 80,
 233    "metadata": {
 234     "collapsed": false
 235    },
 236    "outputs": [
 237     {
 238      "data": {
 239       "text/plain": [
 240        "[('sex', 83531),\n",
 241        " ('hardcore', 69247),\n",
 242        " ('character-name-in-title', 44747),\n",
 243        " ('independent-film', 37932)]"
 244       ]
 245      },
 246      "execution_count": 80,
 247      "metadata": {},
 248      "output_type": "execute_result"
 249     }
 250    ],
 251    "source": [
 252     "keywords_from_title.most_common(4)"
 253    ]
 254   },
 255   {
 256    "cell_type": "code",
 257    "execution_count": 81,
 258    "metadata": {
 259     "collapsed": true
 260    },
 261    "outputs": [],
 262    "source": [
 263     "def pick_random(items, count):\n",
 264     "    i = random.randint(0, count)\n",
 265     "    return list(itertools.islice(items.elements(), i, i+1))[0]"
 266    ]
 267   },
 268   {
 269    "cell_type": "code",
 270    "execution_count": 85,
 271    "metadata": {
 272     "collapsed": false
 273    },
 274    "outputs": [
 275     {
 276      "name": "stdout",
 277      "output_type": "stream",
 278      "text": [
 279       "mother-daughter-relationship\n",
 280       "oral-sex\n",
 281       "cyberpunk\n",
 282       "main-character-shot\n",
 283       "medical-gloves\n",
 284       "tv-show\n",
 285       "female-nudity\n",
 286       "conundrum\n",
 287       "los-angeles-california\n",
 288       "elevator\n",
 289       "yorkshireman\n",
 290       "paper-money\n",
 291       "unfaithful-husband\n",
 292       "brain-teaser\n",
 293       "birthday\n",
 294       "ranch\n",
 295       "festival\n",
 296       "world-war-two\n",
 297       "reference-to-wikipedia\n",
 298       "sundance\n"
 299      ]
 300     }
 301    ],
 302    "source": [
 303     "for _ in range(20):\n",
 304     "    print(pick_random(keywords_from_list, sum_keywords))"
 305    ]
 306   },
 307   {
 308    "cell_type": "code",
 309    "execution_count": null,
 310    "metadata": {
 311     "collapsed": true
 312    },
 313    "outputs": [],
 314    "source": []
 315   }
 316  ],
 317  "metadata": {
 318   "kernelspec": {
 319    "display_name": "Python 3",
 320    "language": "python",
 321    "name": "python3"
 322   },
 323   "language_info": {
 324    "codemirror_mode": {
 325     "name": "ipython",
 326     "version": 3
 327    },
 328    "file_extension": ".py",
 329    "mimetype": "text/x-python",
 330    "name": "python",
 331    "nbconvert_exporter": "python",
 332    "pygments_lexer": "ipython3",
 333    "version": "3.4.3+"
 334   }
 335  },
 336  "nbformat": 4,
 337  "nbformat_minor": 0
 338 }