Removing files from data analysis directory
[ou-summer-of-code-2017.git] / data-analysis / import-data.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 1,
6 "metadata": {
7 "collapsed": true
8 },
9 "outputs": [],
10 "source": [
11 "# Import the required libraries and open the connection to Mongo\n",
12 "\n",
13 "import collections\n",
14 "from datetime import datetime\n",
15 "import matplotlib as mpl\n",
16 "import matplotlib.pyplot as plt\n",
17 "%matplotlib inline\n",
18 "\n",
19 "import numpy as np\n",
20 "import pandas as pd\n",
21 "import scipy.stats\n",
22 "\n",
23 "import pymongo\n",
24 "from bson.objectid import ObjectId\n",
25 "# client = pymongo.MongoClient('mongodb://localhost:27117/')\n",
26 "# client = pymongo.MongoClient('mongodb://ogedei:27017')\n",
27 "client = pymongo.MongoClient('mongodb://localhost:27017')"
28 ]
29 },
30 {
31 "cell_type": "code",
32 "execution_count": 2,
33 "metadata": {
34 "collapsed": true
35 },
36 "outputs": [],
37 "source": [
38 "# Connect to the database\n",
39 "summerofcode_db = client.summerofcode\n",
40 "imp = summerofcode_db.imp\n",
41 "icmas = summerofcode_db.icmas"
42 ]
43 },
44 {
45 "cell_type": "code",
46 "execution_count": 3,
47 "metadata": {},
48 "outputs": [
49 {
50 "name": "stdout",
51 "output_type": "stream",
52 "text": [
53 "all-starts-hours.png\r\n",
54 "attempts-per-day.png\r\n",
55 "completion-hours.png\r\n",
56 "import-data.ipynb\r\n",
57 "participated-in-days.png\r\n",
58 "scores-per-day.png\r\n",
59 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\r\n",
60 "SUMMEROFCODE-Day 10 Word search-grades.csv\r\n",
61 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\r\n",
62 "SUMMEROFCODE-Day 2 Lifts-grades.csv\r\n",
63 "SUMMEROFCODE-Day 3 Door codes-grades.csv\r\n",
64 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\r\n",
65 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\r\n",
66 "SUMMEROFCODE-Day 6 Tour guides-grades.csv\r\n",
67 "SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv\r\n",
68 "SUMMEROFCODE-Day 8 Visa woes-grades.csv\r\n",
69 "SUMMEROFCODE-Day 9 Resolving the bill-grades.csv\r\n",
70 "task-analysis.ipynb\r\n",
71 "winners.ipynb\r\n"
72 ]
73 }
74 ],
75 "source": [
76 "!ls"
77 ]
78 },
79 {
80 "cell_type": "code",
81 "execution_count": 4,
82 "metadata": {
83 "collapsed": true
84 },
85 "outputs": [],
86 "source": [
87 "def update_grades():\n",
88 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
89 " imp.update_one({'_id': i['_id']},\n",
90 " {'$set': {'grade': i['Grade/10']['00'],\n",
91 " 'q1': i['Q'][' 1 /5']['00'],\n",
92 " 'q2': i['Q'][' 2 /5']['00']},\n",
93 " '$unset': {'Grade/10': '', 'Q': ''}})"
94 ]
95 },
96 {
97 "cell_type": "code",
98 "execution_count": 5,
99 "metadata": {
100 "collapsed": true
101 },
102 "outputs": [],
103 "source": [
104 "def update_dates():\n",
105 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
106 " starttime = datetime.strptime(i['Started on'].strip(), '%d %b %Y %H:%M')\n",
107 " if i['Completed'] == '-':\n",
108 " imp.update_one({'_id': i['_id']},\n",
109 " {'$set': {'started': starttime},\n",
110 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})\n",
111 " else:\n",
112 " endtime = datetime.strptime(i['Completed'].strip(), '%d %b %Y %H:%M')\n",
113 " imp.update_one({'_id': i['_id']},\n",
114 " {'$set': {'started': starttime,\n",
115 " 'completed': endtime},\n",
116 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})"
117 ]
118 },
119 {
120 "cell_type": "code",
121 "execution_count": 6,
122 "metadata": {
123 "collapsed": true
124 },
125 "outputs": [],
126 "source": [
127 "def merge_imported(icma_number):\n",
128 " icmas.delete_many({'icma_number': icma_number})\n",
129 "\n",
130 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
131 " del i['_id']\n",
132 " i['icma_number'] = icma_number\n",
133 " icmas.insert_one(i)"
134 ]
135 },
136 {
137 "cell_type": "code",
138 "execution_count": 7,
139 "metadata": {},
140 "outputs": [
141 {
142 "data": {
143 "text/plain": [
144 "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n",
145 " 'SUMMEROFCODE-Day 10 Word search-grades.csv',\n",
146 " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n",
147 " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n",
148 " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n",
149 " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n",
150 " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv',\n",
151 " 'SUMMEROFCODE-Day 6 Tour guides-grades.csv',\n",
152 " 'SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv',\n",
153 " 'SUMMEROFCODE-Day 8 Visa woes-grades.csv',\n",
154 " 'SUMMEROFCODE-Day 9 Resolving the bill-grades.csv']"
155 ]
156 },
157 "execution_count": 7,
158 "metadata": {},
159 "output_type": "execute_result"
160 }
161 ],
162 "source": [
163 "csvs = !ls SUMMEROFCODE*csv\n",
164 "csvs"
165 ]
166 },
167 {
168 "cell_type": "code",
169 "execution_count": 8,
170 "metadata": {},
171 "outputs": [
172 {
173 "data": {
174 "text/plain": [
175 "<pymongo.results.DeleteResult at 0x7f1909986c18>"
176 ]
177 },
178 "execution_count": 8,
179 "metadata": {},
180 "output_type": "execute_result"
181 }
182 ],
183 "source": [
184 "icmas.delete_many({})"
185 ]
186 },
187 {
188 "cell_type": "code",
189 "execution_count": 9,
190 "metadata": {},
191 "outputs": [
192 {
193 "name": "stdout",
194 "output_type": "stream",
195 "text": [
196 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\n",
197 "2017-08-07T10:55:24.121+0100\tconnected to: localhost\n",
198 "2017-08-07T10:55:24.121+0100\tdropping: summerofcode.imp\n",
199 "2017-08-07T10:55:24.190+0100\timported 294 documents\n",
200 "SUMMEROFCODE-Day 10 Word search-grades.csv\n",
201 "2017-08-07T10:55:24.567+0100\tconnected to: localhost\n",
202 "2017-08-07T10:55:24.567+0100\tdropping: summerofcode.imp\n",
203 "2017-08-07T10:55:24.577+0100\timported 64 documents\n",
204 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\n",
205 "2017-08-07T10:55:24.746+0100\tconnected to: localhost\n",
206 "2017-08-07T10:55:24.746+0100\tdropping: summerofcode.imp\n",
207 "2017-08-07T10:55:24.751+0100\timported 158 documents\n",
208 "SUMMEROFCODE-Day 2 Lifts-grades.csv\n",
209 "2017-08-07T10:55:24.990+0100\tconnected to: localhost\n",
210 "2017-08-07T10:55:24.990+0100\tdropping: summerofcode.imp\n",
211 "2017-08-07T10:55:24.993+0100\timported 105 documents\n",
212 "SUMMEROFCODE-Day 3 Door codes-grades.csv\n",
213 "2017-08-07T10:55:25.187+0100\tconnected to: localhost\n",
214 "2017-08-07T10:55:25.187+0100\tdropping: summerofcode.imp\n",
215 "2017-08-07T10:55:25.198+0100\timported 89 documents\n",
216 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\n",
217 "2017-08-07T10:55:25.381+0100\tconnected to: localhost\n",
218 "2017-08-07T10:55:25.381+0100\tdropping: summerofcode.imp\n",
219 "2017-08-07T10:55:25.383+0100\timported 97 documents\n",
220 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\n",
221 "2017-08-07T10:55:25.571+0100\tconnected to: localhost\n",
222 "2017-08-07T10:55:25.571+0100\tdropping: summerofcode.imp\n",
223 "2017-08-07T10:55:25.573+0100\timported 77 documents\n",
224 "SUMMEROFCODE-Day 6 Tour guides-grades.csv\n",
225 "2017-08-07T10:55:25.743+0100\tconnected to: localhost\n",
226 "2017-08-07T10:55:25.744+0100\tdropping: summerofcode.imp\n",
227 "2017-08-07T10:55:25.746+0100\timported 101 documents\n",
228 "SUMMEROFCODE-Day 7 Fixing the minibar-grades.csv\n",
229 "2017-08-07T10:55:25.955+0100\tconnected to: localhost\n",
230 "2017-08-07T10:55:25.955+0100\tdropping: summerofcode.imp\n",
231 "2017-08-07T10:55:25.956+0100\timported 54 documents\n",
232 "SUMMEROFCODE-Day 8 Visa woes-grades.csv\n",
233 "2017-08-07T10:55:26.126+0100\tconnected to: localhost\n",
234 "2017-08-07T10:55:26.126+0100\tdropping: summerofcode.imp\n",
235 "2017-08-07T10:55:26.128+0100\timported 61 documents\n",
236 "SUMMEROFCODE-Day 9 Resolving the bill-grades.csv\n",
237 "2017-08-07T10:55:26.302+0100\tconnected to: localhost\n",
238 "2017-08-07T10:55:26.302+0100\tdropping: summerofcode.imp\n",
239 "2017-08-07T10:55:26.303+0100\timported 42 documents\n"
240 ]
241 }
242 ],
243 "source": [
244 "for csv in csvs:\n",
245 " print(csv)\n",
246 " qcsv = \"'{}'\".format(csv)\n",
247 " !mongoimport --drop --db summerofcode --collection imp --type csv --headerline --ignoreBlanks --file {qcsv}\n",
248 " imp.delete_many({'Surname': 'Overall average'})\n",
249 " update_grades()\n",
250 " update_dates()\n",
251 " merge_imported(int(csv.split()[1]))"
252 ]
253 },
254 {
255 "cell_type": "code",
256 "execution_count": 10,
257 "metadata": {},
258 "outputs": [
259 {
260 "data": {
261 "text/plain": [
262 "1131"
263 ]
264 },
265 "execution_count": 10,
266 "metadata": {},
267 "output_type": "execute_result"
268 }
269 ],
270 "source": [
271 "icmas.find({}, ['icma']).count()"
272 ]
273 },
274 {
275 "cell_type": "code",
276 "execution_count": 11,
277 "metadata": {},
278 "outputs": [
279 {
280 "data": {
281 "text/plain": [
282 "[{'_id': 0, 'count': 293},\n",
283 " {'_id': 1, 'count': 157},\n",
284 " {'_id': 2, 'count': 104},\n",
285 " {'_id': 3, 'count': 88},\n",
286 " {'_id': 4, 'count': 96},\n",
287 " {'_id': 5, 'count': 76},\n",
288 " {'_id': 6, 'count': 100},\n",
289 " {'_id': 7, 'count': 53},\n",
290 " {'_id': 8, 'count': 60},\n",
291 " {'_id': 9, 'count': 41},\n",
292 " {'_id': 10, 'count': 63}]"
293 ]
294 },
295 "execution_count": 11,
296 "metadata": {},
297 "output_type": "execute_result"
298 }
299 ],
300 "source": [
301 "pipeline = [{\"$group\": {\"_id\": \"$icma_number\", \"count\": {\"$sum\": 1}}},\n",
302 " {'$sort': {'_id': 1}}]\n",
303 "list(icmas.aggregate(pipeline))"
304 ]
305 },
306 {
307 "cell_type": "code",
308 "execution_count": null,
309 "metadata": {
310 "collapsed": true
311 },
312 "outputs": [],
313 "source": []
314 }
315 ],
316 "metadata": {
317 "kernelspec": {
318 "display_name": "Python 3",
319 "language": "python",
320 "name": "python3"
321 },
322 "language_info": {
323 "codemirror_mode": {
324 "name": "ipython",
325 "version": 3
326 },
327 "file_extension": ".py",
328 "mimetype": "text/x-python",
329 "name": "python",
330 "nbconvert_exporter": "python",
331 "pygments_lexer": "ipython3",
332 "version": "3.5.2+"
333 }
334 },
335 "nbformat": 4,
336 "nbformat_minor": 2
337 }