24c5691c597aa2a9f53b6a9bfcaab4f7fb661c2d
[ou-summer-of-code-2017.git] / import-data.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {
7 "collapsed": true
8 },
9 "outputs": [],
10 "source": [
11 "# Import the required libraries and open the connection to Mongo\n",
12 "\n",
13 "import collections\n",
14 "from datetime import datetime\n",
15 "import matplotlib as mpl\n",
16 "import matplotlib.pyplot as plt\n",
17 "%matplotlib inline\n",
18 "\n",
19 "import numpy as np\n",
20 "import pandas as pd\n",
21 "import scipy.stats\n",
22 "\n",
23 "import pymongo\n",
24 "from bson.objectid import ObjectId\n",
25 "# client = pymongo.MongoClient('mongodb://localhost:27117/')\n",
26 "# client = pymongo.MongoClient('mongodb://ogedei:27017')\n",
27 "client = pymongo.MongoClient('mongodb://localhost:27017')"
28 ]
29 },
30 {
31 "cell_type": "code",
32 "execution_count": 2,
33 "metadata": {
34 "collapsed": true
35 },
36 "outputs": [],
37 "source": [
38 "# Connect to the database\n",
39 "summerofcode_db = client.summerofcode\n",
40 "imp = summerofcode_db.imp\n",
41 "icmas = summerofcode_db.icmas"
42 ]
43 },
44 {
45 "cell_type": "code",
46 "execution_count": 3,
47 "metadata": {},
48 "outputs": [
49 {
50 "name": "stdout",
51 "output_type": "stream",
52 "text": [
53 "all-starts-hours.png\r\n",
54 "attempts-per-day.png\r\n",
55 "completion-hours.png\r\n",
56 "import-data.ipynb\r\n",
57 "participated-in-days.png\r\n",
58 "scores-per-day.png\r\n",
59 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\r\n",
60 "SUMMEROFCODE-Day 10 Word search-grades.csv\r\n",
61 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\r\n",
62 "SUMMEROFCODE-Day 2 Lifts-grades.csv\r\n",
63 "SUMMEROFCODE-Day 3 Door codes-grades.csv\r\n",
64 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\r\n",
65 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\r\n",
66 "SUMMEROFCODE-Day 6 Tour guides-grades.csv\r\n",
67 "SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv\r\n",
68 "SUMMEROFCODE-Day 8 Visa woes-grades.csv\r\n",
69 "SUMMEROFCODE-Day 9 Resolving the bill-grades.csv\r\n",
70 "survey-email_1.csv\r\n",
71 "survey-email.csv\r\n",
72 "task-analysis.ipynb\r\n",
73 "winners.ipynb\r\n"
74 ]
75 }
76 ],
77 "source": [
78 "!ls"
79 ]
80 },
81 {
82 "cell_type": "code",
83 "execution_count": 4,
84 "metadata": {
85 "collapsed": true
86 },
87 "outputs": [],
88 "source": [
89 "def update_grades():\n",
90 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
91 " imp.update_one({'_id': i['_id']},\n",
92 " {'$set': {'grade': i['Grade/10']['00'],\n",
93 " 'q1': i['Q'][' 1 /5']['00'],\n",
94 " 'q2': i['Q'][' 2 /5']['00']},\n",
95 " '$unset': {'Grade/10': '', 'Q': ''}})"
96 ]
97 },
98 {
99 "cell_type": "code",
100 "execution_count": 5,
101 "metadata": {
102 "collapsed": true
103 },
104 "outputs": [],
105 "source": [
106 "def update_dates():\n",
107 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
108 " starttime = datetime.strptime(i['Started on'].strip(), '%d %b %Y %H:%M')\n",
109 " if i['Completed'] == '-':\n",
110 " imp.update_one({'_id': i['_id']},\n",
111 " {'$set': {'started': starttime},\n",
112 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})\n",
113 " else:\n",
114 " endtime = datetime.strptime(i['Completed'].strip(), '%d %b %Y %H:%M')\n",
115 " imp.update_one({'_id': i['_id']},\n",
116 " {'$set': {'started': starttime,\n",
117 " 'completed': endtime},\n",
118 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})"
119 ]
120 },
121 {
122 "cell_type": "code",
123 "execution_count": 6,
124 "metadata": {
125 "collapsed": true
126 },
127 "outputs": [],
128 "source": [
129 "def merge_imported(icma_number):\n",
130 " icmas.delete_many({'icma_number': icma_number})\n",
131 "\n",
132 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
133 " del i['_id']\n",
134 " i['icma_number'] = icma_number\n",
135 " icmas.insert_one(i)"
136 ]
137 },
138 {
139 "cell_type": "code",
140 "execution_count": 7,
141 "metadata": {},
142 "outputs": [
143 {
144 "data": {
145 "text/plain": [
146 "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n",
147 " 'SUMMEROFCODE-Day 10 Word search-grades.csv',\n",
148 " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n",
149 " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n",
150 " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n",
151 " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n",
152 " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv',\n",
153 " 'SUMMEROFCODE-Day 6 Tour guides-grades.csv',\n",
154 " 'SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv',\n",
155 " 'SUMMEROFCODE-Day 8 Visa woes-grades.csv',\n",
156 " 'SUMMEROFCODE-Day 9 Resolving the bill-grades.csv']"
157 ]
158 },
159 "execution_count": 7,
160 "metadata": {},
161 "output_type": "execute_result"
162 }
163 ],
164 "source": [
165 "csvs = !ls SUMMEROFCODE*csv\n",
166 "csvs"
167 ]
168 },
169 {
170 "cell_type": "code",
171 "execution_count": 8,
172 "metadata": {},
173 "outputs": [
174 {
175 "data": {
176 "text/plain": [
177 "<pymongo.results.DeleteResult at 0x7f648f16b168>"
178 ]
179 },
180 "execution_count": 8,
181 "metadata": {},
182 "output_type": "execute_result"
183 }
184 ],
185 "source": [
186 "icmas.delete_many({})"
187 ]
188 },
189 {
190 "cell_type": "code",
191 "execution_count": 9,
192 "metadata": {},
193 "outputs": [
194 {
195 "name": "stdout",
196 "output_type": "stream",
197 "text": [
198 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\n",
199 "2017-08-01T10:27:51.256+0100\tconnected to: localhost\n",
200 "2017-08-01T10:27:51.256+0100\tdropping: summerofcode.imp\n",
201 "2017-08-01T10:27:51.285+0100\timported 293 documents\n",
202 "SUMMEROFCODE-Day 10 Word search-grades.csv\n",
203 "2017-08-01T10:27:51.655+0100\tconnected to: localhost\n",
204 "2017-08-01T10:27:51.655+0100\tdropping: summerofcode.imp\n",
205 "2017-08-01T10:27:51.657+0100\timported 61 documents\n",
206 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\n",
207 "2017-08-01T10:27:51.821+0100\tconnected to: localhost\n",
208 "2017-08-01T10:27:51.821+0100\tdropping: summerofcode.imp\n",
209 "2017-08-01T10:27:51.824+0100\timported 153 documents\n",
210 "SUMMEROFCODE-Day 2 Lifts-grades.csv\n",
211 "2017-08-01T10:27:52.064+0100\tconnected to: localhost\n",
212 "2017-08-01T10:27:52.064+0100\tdropping: summerofcode.imp\n",
213 "2017-08-01T10:27:52.069+0100\timported 103 documents\n",
214 "SUMMEROFCODE-Day 3 Door codes-grades.csv\n",
215 "2017-08-01T10:27:52.287+0100\tconnected to: localhost\n",
216 "2017-08-01T10:27:52.287+0100\tdropping: summerofcode.imp\n",
217 "2017-08-01T10:27:52.290+0100\timported 88 documents\n",
218 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\n",
219 "2017-08-01T10:27:52.485+0100\tconnected to: localhost\n",
220 "2017-08-01T10:27:52.485+0100\tdropping: summerofcode.imp\n",
221 "2017-08-01T10:27:52.488+0100\timported 97 documents\n",
222 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\n",
223 "2017-08-01T10:27:52.693+0100\tconnected to: localhost\n",
224 "2017-08-01T10:27:52.694+0100\tdropping: summerofcode.imp\n",
225 "2017-08-01T10:27:52.696+0100\timported 77 documents\n",
226 "SUMMEROFCODE-Day 6 Tour guides-grades.csv\n",
227 "2017-08-01T10:27:52.882+0100\tconnected to: localhost\n",
228 "2017-08-01T10:27:52.882+0100\tdropping: summerofcode.imp\n",
229 "2017-08-01T10:27:52.885+0100\timported 100 documents\n",
230 "SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv\n",
231 "2017-08-01T10:27:53.081+0100\tconnected to: localhost\n",
232 "2017-08-01T10:27:53.081+0100\tdropping: summerofcode.imp\n",
233 "2017-08-01T10:27:53.083+0100\timported 54 documents\n",
234 "SUMMEROFCODE-Day 8 Visa woes-grades.csv\n",
235 "2017-08-01T10:27:53.255+0100\tconnected to: localhost\n",
236 "2017-08-01T10:27:53.255+0100\tdropping: summerofcode.imp\n",
237 "2017-08-01T10:27:53.257+0100\timported 61 documents\n",
238 "SUMMEROFCODE-Day 9 Resolving the bill-grades.csv\n",
239 "2017-08-01T10:27:53.421+0100\tconnected to: localhost\n",
240 "2017-08-01T10:27:53.421+0100\tdropping: summerofcode.imp\n",
241 "2017-08-01T10:27:53.422+0100\timported 42 documents\n"
242 ]
243 }
244 ],
245 "source": [
246 "for csv in csvs:\n",
247 " print(csv)\n",
248 " qcsv = \"'{}'\".format(csv)\n",
249 " !mongoimport --drop --db summerofcode --collection imp --type csv --headerline --ignoreBlanks --file {qcsv}\n",
250 " imp.delete_many({'Surname': 'Overall average'})\n",
251 " update_grades()\n",
252 " update_dates()\n",
253 " merge_imported(int(csv.split()[1]))"
254 ]
255 },
256 {
257 "cell_type": "code",
258 "execution_count": 10,
259 "metadata": {},
260 "outputs": [
261 {
262 "data": {
263 "text/plain": [
264 "1118"
265 ]
266 },
267 "execution_count": 10,
268 "metadata": {},
269 "output_type": "execute_result"
270 }
271 ],
272 "source": [
273 "icmas.find({}, ['icma']).count()"
274 ]
275 },
276 {
277 "cell_type": "code",
278 "execution_count": 12,
279 "metadata": {},
280 "outputs": [
281 {
282 "data": {
283 "text/plain": [
284 "[{'_id': 0, 'count': 292},\n",
285 " {'_id': 1, 'count': 152},\n",
286 " {'_id': 2, 'count': 102},\n",
287 " {'_id': 3, 'count': 87},\n",
288 " {'_id': 4, 'count': 96},\n",
289 " {'_id': 5, 'count': 76},\n",
290 " {'_id': 6, 'count': 99},\n",
291 " {'_id': 7, 'count': 53},\n",
292 " {'_id': 8, 'count': 60},\n",
293 " {'_id': 9, 'count': 41},\n",
294 " {'_id': 10, 'count': 60}]"
295 ]
296 },
297 "execution_count": 12,
298 "metadata": {},
299 "output_type": "execute_result"
300 }
301 ],
302 "source": [
303 "pipeline = [{\"$group\": {\"_id\": \"$icma_number\", \"count\": {\"$sum\": 1}}},\n",
304 " {'$sort': {'_id': 1}}]\n",
305 "list(icmas.aggregate(pipeline))"
306 ]
307 },
308 {
309 "cell_type": "code",
310 "execution_count": null,
311 "metadata": {
312 "collapsed": true
313 },
314 "outputs": [],
315 "source": []
316 }
317 ],
318 "metadata": {
319 "kernelspec": {
320 "display_name": "Python 3",
321 "language": "python",
322 "name": "python3"
323 },
324 "language_info": {
325 "codemirror_mode": {
326 "name": "ipython",
327 "version": 3
328 },
329 "file_extension": ".py",
330 "mimetype": "text/x-python",
331 "name": "python",
332 "nbconvert_exporter": "python",
333 "pygments_lexer": "ipython3",
334 "version": "3.5.2+"
335 }
336 },
337 "nbformat": 4,
338 "nbformat_minor": 2
339 }