Fixed missing file contents
[tm351-mongo.git] / data-cleaning.ipynb
1 {
2 "metadata": {
3 "name": ""
4 },
5 "nbformat": 3,
6 "nbformat_minor": 0,
7 "worksheets": [
8 {
9 "cells": [
10 {
11 "cell_type": "code",
12 "collapsed": false,
13 "input": [
14 "import pymongo\n",
15 "client = pymongo.MongoClient('mongodb://ogedei:27017/')"
16 ],
17 "language": "python",
18 "metadata": {},
19 "outputs": [],
20 "prompt_number": 1
21 },
22 {
23 "cell_type": "code",
24 "collapsed": false,
25 "input": [
26 "db = client.accidents\n",
27 "accidents = db.accidents\n",
28 "vehicles = db.vehicles\n",
29 "casualties = db.casualties"
30 ],
31 "language": "python",
32 "metadata": {},
33 "outputs": [],
34 "prompt_number": 4
35 },
36 {
37 "cell_type": "code",
38 "collapsed": false,
39 "input": [
40 "accidents.find().count()"
41 ],
42 "language": "python",
43 "metadata": {},
44 "outputs": [
45 {
46 "metadata": {},
47 "output_type": "pyout",
48 "prompt_number": 5,
49 "text": [
50 "1355615"
51 ]
52 }
53 ],
54 "prompt_number": 5
55 },
56 {
57 "cell_type": "code",
58 "collapsed": false,
59 "input": [
60 "casualties.find().count()"
61 ],
62 "language": "python",
63 "metadata": {},
64 "outputs": [
65 {
66 "metadata": {},
67 "output_type": "pyout",
68 "prompt_number": 9,
69 "text": [
70 "0"
71 ]
72 }
73 ],
74 "prompt_number": 9
75 },
76 {
77 "cell_type": "code",
78 "collapsed": false,
79 "input": [
80 "[accidents.find_one({'Accident_Index':v['Acc_Index']}, ['Accident_Index', 'Date']) \n",
81 " for v in vehicles.find(fields=['Acc_Index'], limit=10)]"
82 ],
83 "language": "python",
84 "metadata": {},
85 "outputs": [
86 {
87 "metadata": {},
88 "output_type": "pyout",
89 "prompt_number": 6,
90 "text": [
91 "[]"
92 ]
93 }
94 ],
95 "prompt_number": 6
96 },
97 {
98 "cell_type": "code",
99 "collapsed": false,
100 "input": [
101 "# Find all the vehicles that don't have an accident\n",
102 "for v in vehicles.find(fields=['Acc_Index']):\n",
103 " if not accidents.find_one({'Accident_Index': v['Acc_Index']}):\n",
104 " print(v)"
105 ],
106 "language": "python",
107 "metadata": {},
108 "outputs": [],
109 "prompt_number": 7
110 },
111 {
112 "cell_type": "code",
113 "collapsed": false,
114 "input": [
115 "for v in vehicles.find():\n",
116 " accidents.update({'Accident_Index': v['Acc_Index']}, {'$push' : {\"Involved_Vehicles\" : v}})"
117 ],
118 "language": "python",
119 "metadata": {},
120 "outputs": [],
121 "prompt_number": 7
122 },
123 {
124 "cell_type": "code",
125 "collapsed": false,
126 "input": [
127 "for v in vehicles.find():\n",
128 " accidents.update({'Accident_Index': v['Acc_Index']}, {'$push' : {\"Vehicles\" : v}})"
129 ],
130 "language": "python",
131 "metadata": {},
132 "outputs": [],
133 "prompt_number": 8
134 },
135 {
136 "cell_type": "code",
137 "collapsed": false,
138 "input": [
139 "for c in casualties.find():\n",
140 " accidents.update({'Accident_Index': c['Acc_Index']}, {'$push' : {\"Casualties\" : c}})"
141 ],
142 "language": "python",
143 "metadata": {},
144 "outputs": [],
145 "prompt_number": 9
146 },
147 {
148 "cell_type": "code",
149 "collapsed": false,
150 "input": [
151 "accidents.update({\"$exists\": \"Involved_Vehicles\"}, {\"$unset\": {\"Involved_Vehicles\":1}})"
152 ],
153 "language": "python",
154 "metadata": {},
155 "outputs": [
156 {
157 "metadata": {},
158 "output_type": "pyout",
159 "prompt_number": 11,
160 "text": [
161 "{'connectionId': 20, 'err': None, 'n': 0, 'ok': 1.0, 'updatedExisting': False}"
162 ]
163 }
164 ],
165 "prompt_number": 11
166 },
167 {
168 "cell_type": "code",
169 "collapsed": false,
170 "input": [
171 "accidents.find({\"Involved_Vehicles\" : {\"$exists\" : 1}}).count()"
172 ],
173 "language": "python",
174 "metadata": {},
175 "outputs": [
176 {
177 "metadata": {},
178 "output_type": "pyout",
179 "prompt_number": 22,
180 "text": [
181 "0"
182 ]
183 }
184 ],
185 "prompt_number": 22
186 },
187 {
188 "cell_type": "code",
189 "collapsed": false,
190 "input": [
191 "accidents.update({\"Involved_Vehicles\" : {\"$exists\" : 1}}, {\"$unset\": {\"Involved_Vehicles\":1}}, multi=True)"
192 ],
193 "language": "python",
194 "metadata": {},
195 "outputs": [
196 {
197 "metadata": {},
198 "output_type": "pyout",
199 "prompt_number": 21,
200 "text": [
201 "{'connectionId': 20, 'err': None, 'n': 57, 'ok': 1.0, 'updatedExisting': True}"
202 ]
203 }
204 ],
205 "prompt_number": 21
206 },
207 {
208 "cell_type": "code",
209 "collapsed": false,
210 "input": [],
211 "language": "python",
212 "metadata": {},
213 "outputs": []
214 }
215 ],
216 "metadata": {}
217 }
218 ]
219 }