From 43d29f284e13eb582dee4bd1866084abf1bc1115 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Thu, 3 Apr 2014 18:30:43 +0100 Subject: [PATCH] Initial files --- data-cleaning.ipynb | 219 +++++++++++++++++++++++++++++++++++++ data-use.ipynb | 0 make-small-accidents.ipynb | 168 ++++++++++++++++++++++++++++ 3 files changed, 387 insertions(+) create mode 100755 data-cleaning.ipynb create mode 100755 data-use.ipynb create mode 100644 make-small-accidents.ipynb diff --git a/data-cleaning.ipynb b/data-cleaning.ipynb new file mode 100755 index 0000000..30913ac --- /dev/null +++ b/data-cleaning.ipynb @@ -0,0 +1,219 @@ +{ + "metadata": { + "name": "" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import pymongo\n", + "client = pymongo.MongoClient('mongodb://ogedei:27017/')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "db = client.accidents\n", + "accidents = db.accidents\n", + "vehicles = db.vehicles\n", + "casualties = db.casualties" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 4 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "accidents.find().count()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 5, + "text": [ + "1355615" + ] + } + ], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "casualties.find().count()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 9, + "text": [ + "0" + ] + } + ], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "[accidents.find_one({'Accident_Index':v['Acc_Index']}, ['Accident_Index', 'Date']) \n", + " for v in vehicles.find(fields=['Acc_Index'], limit=10)]" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 6, + "text": [ + "[]" + ] + } + ], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "# Find all the vehicles that don't have an accident\n", + "for v in vehicles.find(fields=['Acc_Index']):\n", + " if not accidents.find_one({'Accident_Index': v['Acc_Index']}):\n", + " print(v)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for v in vehicles.find():\n", + " accidents.update({'Accident_Index': v['Acc_Index']}, {'$push' : {\"Involved_Vehicles\" : v}})" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for v in vehicles.find():\n", + " accidents.update({'Accident_Index': v['Acc_Index']}, {'$push' : {\"Vehicles\" : v}})" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 8 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for c in casualties.find():\n", + " accidents.update({'Accident_Index': c['Acc_Index']}, {'$push' : {\"Casualties\" : c}})" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "accidents.update({\"$exists\": \"Involved_Vehicles\"}, {\"$unset\": {\"Involved_Vehicles\":1}})" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 11, + "text": [ + "{'connectionId': 20, 'err': None, 'n': 0, 'ok': 1.0, 'updatedExisting': False}" + ] + } + ], + "prompt_number": 11 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "accidents.find({\"Involved_Vehicles\" : {\"$exists\" : 1}}).count()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 22, + "text": [ + "0" + ] + } + ], + "prompt_number": 22 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "accidents.update({\"Involved_Vehicles\" : {\"$exists\" : 1}}, {\"$unset\": {\"Involved_Vehicles\":1}}, multi=True)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 21, + "text": [ + "{'connectionId': 20, 'err': None, 'n': 57, 'ok': 1.0, 'updatedExisting': True}" + ] + } + ], + "prompt_number": 21 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/data-use.ipynb b/data-use.ipynb new file mode 100755 index 0000000..e69de29 diff --git a/make-small-accidents.ipynb b/make-small-accidents.ipynb new file mode 100644 index 0000000..8635c80 --- /dev/null +++ b/make-small-accidents.ipynb @@ -0,0 +1,168 @@ +{ + "metadata": { + "name": "" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import pymongo\n", + "client = pymongo.MongoClient('mongodb://ogedei:27017/')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Connect to the `accidents` database" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "db = client.accidents\n", + "accidents = db.accidents" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "accidents.find().count()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 4, + "text": [ + "1355615" + ] + } + ], + "prompt_number": 4 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop any existing small accident database" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "client.drop_database('asmall')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 21 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a new database with a new collection.\n", + "\n", + "Note that this is created lazily, so neither the database nor the collection will appear on the server until we've put some data in it." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "small_db = client.asmall\n", + "small_accidents = small_db.accidents" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 22 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy the first 100 accidents across." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "for a in accidents.find(limit=100):\n", + " small_accidents.insert(a)\n", + "small_accidents.find().count()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 23, + "text": [ + "100" + ] + } + ], + "prompt_number": 23 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the index for it." + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "small_accidents.create_index('Accident_Index')" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 24, + "text": [ + "'Accident_Index_1'" + ] + } + ], + "prompt_number": 24 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file -- 2.34.1