Fixed bugs in recursive version of interleave
[ou-summer-of-code-2017.git] / data-analysis / import-data.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {
7 "collapsed": true
8 },
9 "outputs": [],
10 "source": [
11 "# Import the required libraries and open the connection to Mongo\n",
12 "\n",
13 "import collections\n",
14 "from datetime import datetime\n",
15 "import matplotlib as mpl\n",
16 "import matplotlib.pyplot as plt\n",
17 "%matplotlib inline\n",
18 "\n",
19 "import numpy as np\n",
20 "import pandas as pd\n",
21 "import scipy.stats\n",
22 "\n",
23 "import pymongo\n",
24 "from bson.objectid import ObjectId\n",
25 "# client = pymongo.MongoClient('mongodb://localhost:27117/')\n",
26 "# client = pymongo.MongoClient('mongodb://ogedei:27017')\n",
27 "client = pymongo.MongoClient('mongodb://localhost:27017')"
28 ]
29 },
30 {
31 "cell_type": "code",
32 "execution_count": 2,
33 "metadata": {
34 "collapsed": true
35 },
36 "outputs": [],
37 "source": [
38 "# Connect to the database\n",
39 "summerofcode_db = client.summerofcode\n",
40 "imp = summerofcode_db.imp\n",
41 "icmas = summerofcode_db.icmas"
42 ]
43 },
44 {
45 "cell_type": "code",
46 "execution_count": 3,
47 "metadata": {},
48 "outputs": [
49 {
50 "name": "stdout",
51 "output_type": "stream",
52 "text": [
53 "import-data.ipynb\r\n",
54 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\r\n",
55 "SUMMEROFCODE-Day 10 Word search-grades.csv\r\n",
56 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\r\n",
57 "SUMMEROFCODE-Day 2 Lifts-grades.csv\r\n",
58 "SUMMEROFCODE-Day 3 Door codes-grades.csv\r\n",
59 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\r\n",
60 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\r\n",
61 "SUMMEROFCODE-Day 6 Tour guides-grades.csv\r\n",
62 "SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv\r\n",
63 "SUMMEROFCODE-Day 8 Visa woes-grades.csv\r\n",
64 "SUMMEROFCODE-Day 9 Resolving the bill-grades.csv\r\n",
65 "survey-email_1.csv\r\n",
66 "survey-email.csv\r\n",
67 "task-analysis.ipynb\r\n",
68 "winners.ipynb\r\n"
69 ]
70 }
71 ],
72 "source": [
73 "!ls"
74 ]
75 },
76 {
77 "cell_type": "code",
78 "execution_count": 4,
79 "metadata": {
80 "collapsed": true
81 },
82 "outputs": [],
83 "source": [
84 "def update_grades():\n",
85 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
86 " imp.update_one({'_id': i['_id']},\n",
87 " {'$set': {'grade': i['Grade/10']['00'],\n",
88 " 'q1': i['Q'][' 1 /5']['00'],\n",
89 " 'q2': i['Q'][' 2 /5']['00']},\n",
90 " '$unset': {'Grade/10': '', 'Q': ''}})"
91 ]
92 },
93 {
94 "cell_type": "code",
95 "execution_count": 5,
96 "metadata": {
97 "collapsed": true
98 },
99 "outputs": [],
100 "source": [
101 "def update_dates():\n",
102 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
103 " starttime = datetime.strptime(i['Started on'].strip(), '%d %b %Y %H:%M')\n",
104 " if i['Completed'] == '-':\n",
105 " imp.update_one({'_id': i['_id']},\n",
106 " {'$set': {'started': starttime},\n",
107 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})\n",
108 " else:\n",
109 " endtime = datetime.strptime(i['Completed'].strip(), '%d %b %Y %H:%M')\n",
110 " imp.update_one({'_id': i['_id']},\n",
111 " {'$set': {'started': starttime,\n",
112 " 'completed': endtime},\n",
113 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})"
114 ]
115 },
116 {
117 "cell_type": "code",
118 "execution_count": 6,
119 "metadata": {
120 "collapsed": true
121 },
122 "outputs": [],
123 "source": [
124 "def merge_imported(icma_number):\n",
125 " icmas.delete_many({'icma_number': icma_number})\n",
126 "\n",
127 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
128 " del i['_id']\n",
129 " i['icma_number'] = icma_number\n",
130 " icmas.insert_one(i)"
131 ]
132 },
133 {
134 "cell_type": "code",
135 "execution_count": 7,
136 "metadata": {},
137 "outputs": [
138 {
139 "data": {
140 "text/plain": [
141 "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n",
142 " 'SUMMEROFCODE-Day 10 Word search-grades.csv',\n",
143 " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n",
144 " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n",
145 " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n",
146 " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n",
147 " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv',\n",
148 " 'SUMMEROFCODE-Day 6 Tour guides-grades.csv',\n",
149 " 'SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv',\n",
150 " 'SUMMEROFCODE-Day 8 Visa woes-grades.csv',\n",
151 " 'SUMMEROFCODE-Day 9 Resolving the bill-grades.csv']"
152 ]
153 },
154 "execution_count": 7,
155 "metadata": {},
156 "output_type": "execute_result"
157 }
158 ],
159 "source": [
160 "csvs = !ls SUMMEROFCODE*csv\n",
161 "csvs"
162 ]
163 },
164 {
165 "cell_type": "code",
166 "execution_count": 8,
167 "metadata": {},
168 "outputs": [
169 {
170 "data": {
171 "text/plain": [
172 "<pymongo.results.DeleteResult at 0x7f7700eba5a0>"
173 ]
174 },
175 "execution_count": 8,
176 "metadata": {},
177 "output_type": "execute_result"
178 }
179 ],
180 "source": [
181 "icmas.delete_many({})"
182 ]
183 },
184 {
185 "cell_type": "code",
186 "execution_count": 9,
187 "metadata": {},
188 "outputs": [
189 {
190 "name": "stdout",
191 "output_type": "stream",
192 "text": [
193 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\n",
194 "2017-07-30T19:43:58.575+0100\tconnected to: localhost\n",
195 "2017-07-30T19:43:58.575+0100\tdropping: summerofcode.imp\n",
196 "2017-07-30T19:43:58.592+0100\timported 292 documents\n",
197 "SUMMEROFCODE-Day 10 Word search-grades.csv\n",
198 "2017-07-30T19:43:58.948+0100\tconnected to: localhost\n",
199 "2017-07-30T19:43:58.948+0100\tdropping: summerofcode.imp\n",
200 "2017-07-30T19:43:58.950+0100\timported 60 documents\n",
201 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\n",
202 "2017-07-30T19:43:59.121+0100\tconnected to: localhost\n",
203 "2017-07-30T19:43:59.121+0100\tdropping: summerofcode.imp\n",
204 "2017-07-30T19:43:59.124+0100\timported 153 documents\n",
205 "SUMMEROFCODE-Day 2 Lifts-grades.csv\n",
206 "2017-07-30T19:43:59.366+0100\tconnected to: localhost\n",
207 "2017-07-30T19:43:59.366+0100\tdropping: summerofcode.imp\n",
208 "2017-07-30T19:43:59.369+0100\timported 102 documents\n",
209 "SUMMEROFCODE-Day 3 Door codes-grades.csv\n",
210 "2017-07-30T19:43:59.569+0100\tconnected to: localhost\n",
211 "2017-07-30T19:43:59.569+0100\tdropping: summerofcode.imp\n",
212 "2017-07-30T19:43:59.571+0100\timported 88 documents\n",
213 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\n",
214 "2017-07-30T19:43:59.753+0100\tconnected to: localhost\n",
215 "2017-07-30T19:43:59.753+0100\tdropping: summerofcode.imp\n",
216 "2017-07-30T19:43:59.756+0100\timported 97 documents\n",
217 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\n",
218 "2017-07-30T19:43:59.960+0100\tconnected to: localhost\n",
219 "2017-07-30T19:43:59.960+0100\tdropping: summerofcode.imp\n",
220 "2017-07-30T19:43:59.963+0100\timported 76 documents\n",
221 "SUMMEROFCODE-Day 6 Tour guides-grades.csv\n",
222 "2017-07-30T19:44:00.141+0100\tconnected to: localhost\n",
223 "2017-07-30T19:44:00.142+0100\tdropping: summerofcode.imp\n",
224 "2017-07-30T19:44:00.144+0100\timported 98 documents\n",
225 "SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv\n",
226 "2017-07-30T19:44:00.337+0100\tconnected to: localhost\n",
227 "2017-07-30T19:44:00.337+0100\tdropping: summerofcode.imp\n",
228 "2017-07-30T19:44:00.339+0100\timported 54 documents\n",
229 "SUMMEROFCODE-Day 8 Visa woes-grades.csv\n",
230 "2017-07-30T19:44:00.496+0100\tconnected to: localhost\n",
231 "2017-07-30T19:44:00.496+0100\tdropping: summerofcode.imp\n",
232 "2017-07-30T19:44:00.498+0100\timported 60 documents\n",
233 "SUMMEROFCODE-Day 9 Resolving the bill-grades.csv\n",
234 "2017-07-30T19:44:00.660+0100\tconnected to: localhost\n",
235 "2017-07-30T19:44:00.660+0100\tdropping: summerofcode.imp\n",
236 "2017-07-30T19:44:00.662+0100\timported 42 documents\n"
237 ]
238 }
239 ],
240 "source": [
241 "for csv in csvs:\n",
242 " print(csv)\n",
243 " qcsv = \"'{}'\".format(csv)\n",
244 " !mongoimport --drop --db summerofcode --collection imp --type csv --headerline --ignoreBlanks --file {qcsv}\n",
245 " imp.delete_many({'Surname': 'Overall average'})\n",
246 " update_grades()\n",
247 " update_dates()\n",
248 " merge_imported(int(csv.split()[1]))"
249 ]
250 },
251 {
252 "cell_type": "code",
253 "execution_count": 10,
254 "metadata": {},
255 "outputs": [
256 {
257 "data": {
258 "text/plain": [
259 "1111"
260 ]
261 },
262 "execution_count": 10,
263 "metadata": {},
264 "output_type": "execute_result"
265 }
266 ],
267 "source": [
268 "icmas.find({}, ['icma']).count()"
269 ]
270 },
271 {
272 "cell_type": "code",
273 "execution_count": 11,
274 "metadata": {},
275 "outputs": [
276 {
277 "data": {
278 "text/plain": [
279 "[{'_id': 3, 'count': 87},\n",
280 " {'_id': 2, 'count': 101},\n",
281 " {'_id': 10, 'count': 59},\n",
282 " {'_id': 0, 'count': 291},\n",
283 " {'_id': 6, 'count': 97},\n",
284 " {'_id': 1, 'count': 152},\n",
285 " {'_id': 5, 'count': 75},\n",
286 " {'_id': 4, 'count': 96},\n",
287 " {'_id': 8, 'count': 59},\n",
288 " {'_id': 7, 'count': 53},\n",
289 " {'_id': 9, 'count': 41}]"
290 ]
291 },
292 "execution_count": 11,
293 "metadata": {},
294 "output_type": "execute_result"
295 }
296 ],
297 "source": [
298 "pipeline = [{\"$group\": {\"_id\": \"$icma_number\", \"count\": {\"$sum\": 1}}}]\n",
299 "list(icmas.aggregate(pipeline))"
300 ]
301 },
302 {
303 "cell_type": "code",
304 "execution_count": null,
305 "metadata": {
306 "collapsed": true
307 },
308 "outputs": [],
309 "source": []
310 }
311 ],
312 "metadata": {
313 "kernelspec": {
314 "display_name": "Python 3",
315 "language": "python",
316 "name": "python3"
317 },
318 "language_info": {
319 "codemirror_mode": {
320 "name": "ipython",
321 "version": 3
322 },
323 "file_extension": ".py",
324 "mimetype": "text/x-python",
325 "name": "python",
326 "nbconvert_exporter": "python",
327 "pygments_lexer": "ipython3",
328 "version": "3.5.2+"
329 }
330 },
331 "nbformat": 4,
332 "nbformat_minor": 2
333 }