{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate random movie plot elements\n",
    "\n",
    "Data from [IMDB](http://www.imdb.com/interfaces), held at [ftp://ftp.fu-berlin.de/pub/misc/movies/database/](ftp://ftp.fu-berlin.de/pub/misc/movies/database/), in the `keywords.list.gz` file.\n",
    "\n",
    "[Abulafia generator](http://www.random-generator.com/index.php?title=Plot_Keyword_Oracle) and [Story-games thread](http://story-games.com/forums/discussion/3502/new-toy-imdb-plot-keywords), and [an earlier version of the list](http://www.logrus.com/~moose/page1/files/cleanplots.txt)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Get the file\n",
    "!wget ftp://ftp.fu-berlin.de/pub/misc/movies/database/keywords.list.gz\n",
    "\n",
    "!gunzip keywords.list.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "generate-random-plots.ipynb  keywords.list\r\n"
     ]
    }
   ],
   "source": [
    "!ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import re\n",
    "import random\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def read_keywords_from_list(line):\n",
    "    if not line:\n",
    "        return []\n",
    "    accumulator = []\n",
    "    for kn in re.split('\\t+', line):\n",
    "        k, n = kn.split()\n",
    "        n = int(n[1:-1])\n",
    "        accumulator += [k] * n\n",
    "    return accumulator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['blue-pantyhose', 'blue-party', 'blue-party', 'blue-pearl']"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "read_keywords_from_list('blue-pantyhose (1)\t\tblue-party (2)\tblue-pearl (1)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def read_keywords_from_title(line):\n",
    "    if line:\n",
    "        return [re.split('\\t+', line)[1]]\n",
    "    else:\n",
    "        return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['beer-drinking']"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "read_keywords_from_title('Been Down So Long It Looks Like Up to Me (1971)\t\tbeer-drinking')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "skipping underlines ========\n",
      "ending list 5: Submission Rules\n",
      "skipping underlines =============\n",
      "skipping underlines ==================\n",
      "skipping underlines ===============\n",
      "starting list keywords in use:\n",
      "ending list 5: Submission Rules\n",
      "skipping underlines ===================\n",
      "skipping underlines ==============\n",
      "skipping underlines ======================\n",
      "skipping underlines =================================================\n",
      "pre-title 8: THE KEYWORDS LIST\n",
      "starting title ====================\n"
     ]
    }
   ],
   "source": [
    "keywords_from_list = collections.Counter()\n",
    "keywords_from_title = collections.Counter()\n",
    "reading_state = None\n",
    "for line in open('keywords.list', encoding='latin-1').readlines():\n",
    "        \n",
    "    if line.strip().startswith('keywords in use'):\n",
    "        reading_state = 'keywords_from_list'\n",
    "        print('starting list', line.strip())\n",
    "        continue\n",
    "    if line.strip().startswith('5: Submission Rules'):\n",
    "        readings_state = None\n",
    "        print('ending list', line.strip())\n",
    "        continue\n",
    "    if reading_state == 'pre_from_title':\n",
    "        if line.strip().startswith('==='):\n",
    "            reading_state = 'keywords_from_title'\n",
    "            print('starting title', line.strip())\n",
    "        else:\n",
    "            reading_state = None\n",
    "            print('not start of titles', line.strip())\n",
    "        continue\n",
    "    if line.strip().startswith('8: THE KEYWORDS LIST'):\n",
    "        reading_state = 'pre_from_title'\n",
    "        print('pre-title', line.strip())\n",
    "        continue\n",
    "    if line.strip().startswith('==='):\n",
    "        reading_state = None\n",
    "        print('skipping underlines', line.strip())\n",
    "        continue\n",
    "        \n",
    "    if reading_state == 'keywords_from_list':\n",
    "        keywords_from_list.update(read_keywords_from_list(line.strip()))\n",
    "    elif reading_state == 'keywords_from_title':\n",
    "        keywords_from_title.update(read_keywords_from_title(line.strip()))\n",
    "\n",
    "sum_keywords = sum(keywords_from_list.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('sex', 83531),\n",
       " ('hardcore', 69247),\n",
       " ('character-name-in-title', 44747),\n",
       " ('independent-film', 37932)]"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "keywords_from_list.most_common(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('sex', 83531),\n",
       " ('hardcore', 69247),\n",
       " ('character-name-in-title', 44747),\n",
       " ('independent-film', 37932)]"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "keywords_from_title.most_common(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def pick_random(items, count):\n",
    "    i = random.randint(0, count)\n",
    "    return list(itertools.islice(items.elements(), i, i+1))[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mother-daughter-relationship\n",
      "oral-sex\n",
      "cyberpunk\n",
      "main-character-shot\n",
      "medical-gloves\n",
      "tv-show\n",
      "female-nudity\n",
      "conundrum\n",
      "los-angeles-california\n",
      "elevator\n",
      "yorkshireman\n",
      "paper-money\n",
      "unfaithful-husband\n",
      "brain-teaser\n",
      "birthday\n",
      "ranch\n",
      "festival\n",
      "world-war-two\n",
      "reference-to-wikipedia\n",
      "sundance\n"
     ]
    }
   ],
   "source": [
    "for _ in range(20):\n",
    "    print(pick_random(keywords_from_list, sum_keywords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}