From: Neil Smith Date: Fri, 4 Mar 2016 14:12:00 +0000 (+0000) Subject: Initial working X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=abbb5a3ff02c8f39fbf41aee5177b51d0a92e674;p=imdb-keyword-list.git Initial working --- abbb5a3ff02c8f39fbf41aee5177b51d0a92e674 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0546add --- /dev/null +++ b/.gitignore @@ -0,0 +1,45 @@ +*.py[cod] + +# C extensions +*.so + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg +lib +lib64 +__pycache__ + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox +nosetests.xml + +# Translations +*.mo + +# Mr Developer +.mr.developer.cfg +.project +.pydevproject + +# IPython +.ipynb* + +# Sublime text +*.sublime-workspace + +# Logs +*.log diff --git a/generate-random-plots.ipynb b/generate-random-plots.ipynb new file mode 100644 index 0000000..82a7f23 --- /dev/null +++ b/generate-random-plots.ipynb @@ -0,0 +1,338 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate random movie plot elements\n", + "\n", + "Data from [IMDB](http://www.imdb.com/interfaces), held at [ftp://ftp.fu-berlin.de/pub/misc/movies/database/](ftp://ftp.fu-berlin.de/pub/misc/movies/database/), in the `keywords.list.gz` file.\n", + "\n", + "[Abulafia generator](http://www.random-generator.com/index.php?title=Plot_Keyword_Oracle) and [Story-games thread](http://story-games.com/forums/discussion/3502/new-toy-imdb-plot-keywords), and [an earlier version of the list](http://www.logrus.com/~moose/page1/files/cleanplots.txt)." 
# %%
# Get the file (shell commands from the original notebook cell):
# !wget ftp://ftp.fu-berlin.de/pub/misc/movies/database/keywords.list.gz
# !gunzip keywords.list.gz

# %%
# !ls

# %%
import collections
import re
import random
import itertools


# %%
def read_keywords_from_list(line):
    """Expand one line of ``keyword (count)`` fields into a flat keyword list.

    Fields are separated by one or more tab characters. Each keyword is
    repeated ``count`` times in the result, so the list can be passed
    directly to ``collections.Counter.update``. For example, the fields
    ``blue-party (2)`` and ``blue-pearl (1)`` on one line give
    ``['blue-party', 'blue-party', 'blue-pearl']``.

    Args:
        line: Text of one line. May be empty or ``None``.

    Returns:
        A list of keyword strings; ``[]`` when the line is empty or
        contains only whitespace.

    Raises:
        ValueError: If a non-blank field is not exactly two
            whitespace-separated tokens, or the text between the first and
            last character of the second token is not an integer.
    """
    if not line:
        return []
    keywords = []
    for field in re.split('\t+', line):
        # A leading/trailing tab or a whitespace-only line produces blank
        # fields; skip them instead of failing to unpack below.
        if not field.strip():
            continue
        keyword, count = field.split()
        # The count is wrapped in one delimiter character on each side,
        # e.g. "(2)"; drop both before converting.
        keywords.extend([keyword] * int(count[1:-1]))
    return keywords


# %%
read_keywords_from_list('blue-pantyhose (1)\t\tblue-party (2)\tblue-pearl (1)')


# %%
def read_keywords_from_title(line):
    """Return the keyword of one ``title<TAB>keyword`` line as a list.

    The line is split on runs of tab characters and the second field is
    taken as the keyword.

    Args:
        line: Text of one line. May be empty or ``None``.

    Returns:
        A one-element list holding the keyword, or ``[]`` when the line is
        empty, has no tab-separated second field, or that field is empty.
    """
    if not line:
        return []
    fields = re.split('\t+', line)
    # Lines without a tab (or ending in one) carry no keyword; report
    # nothing rather than raising IndexError or counting '' as a keyword.
    if len(fields) < 2 or not fields[1]:
        return []
    return [fields[1]]
# %%
read_keywords_from_title('Been Down So Long It Looks Like Up to Me (1971)\t\tbeer-drinking')

# %%
# Scan keywords.list once, counting keywords from two sections:
#   * the table that follows a line starting with "keywords in use"
#     (counted into keywords_from_list), and
#   * the per-title lines that follow the "8: THE KEYWORDS LIST" heading and
#     its "===" underline (counted into keywords_from_title).
# reading_state records which section, if any, the current line belongs to.
keywords_from_list = collections.Counter()
keywords_from_title = collections.Counter()
reading_state = None
# Iterate the file lazily inside a `with` block so the handle is closed and
# the whole file is never held in memory at once.
with open('keywords.list', encoding='latin-1') as keywords_file:
    for line in keywords_file:
        stripped = line.strip()

        if stripped.startswith('keywords in use'):
            reading_state = 'keywords_from_list'
            print('starting list', stripped)
            continue
        if stripped.startswith('5: Submission Rules'):
            # Was misspelled `readings_state`, which created an unused
            # variable and left the list section active.
            reading_state = None
            print('ending list', stripped)
            continue
        if reading_state == 'pre_from_title':
            # The heading only opens the title section when the very next
            # line is its "===" underline.
            if stripped.startswith('==='):
                reading_state = 'keywords_from_title'
                print('starting title', stripped)
            else:
                reading_state = None
                print('not start of titles', stripped)
            continue
        if stripped.startswith('8: THE KEYWORDS LIST'):
            reading_state = 'pre_from_title'
            print('pre-title', stripped)
            continue
        if stripped.startswith('==='):
            # Any other underline ends whatever section was being read.
            reading_state = None
            print('skipping underlines', stripped)
            continue

        if reading_state == 'keywords_from_list':
            keywords_from_list.update(read_keywords_from_list(stripped))
        elif reading_state == 'keywords_from_title':
            keywords_from_title.update(read_keywords_from_title(stripped))

sum_keywords = sum(keywords_from_list.values())

# %%
keywords_from_list.most_common(4)

# %%
keywords_from_title.most_common(4)


# %%
def pick_random(items, count):
    """Pick one element of a Counter at random, weighted by its count.

    Args:
        items: A ``collections.Counter``; its ``elements()`` sequence is the
            population that is sampled.
        count: Number of elements in that population, i.e.
            ``sum(items.values())``. Passed in so it is not recomputed for
            every draw.

    Returns:
        One element of ``items``.

    Raises:
        IndexError: If ``count`` is less than 1, or if ``count`` exceeds the
            real number of elements and the drawn position falls past the
            end.
    """
    if count < 1:
        raise IndexError('cannot pick from an empty collection')
    # randint(0, count) is inclusive at both ends and could return `count`,
    # one past the last valid position; randrange(count) yields 0..count-1.
    i = random.randrange(count)
    return list(itertools.islice(items.elements(), i, i + 1))[0]
# %%
# Print twenty keywords, each drawn by pick_random from keywords_from_list.
# The generator is consumed one item at a time, so every draw is printed
# before the next one is made.
random_keywords = (
    pick_random(keywords_from_list, sum_keywords) for _ in range(20)
)
for keyword in random_keywords:
    print(keyword)