Added examples
[ou-summer-of-code-2017.git] / data-analysis / import-data.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 20,
6 "metadata": {},
7 "outputs": [],
8 "source": [
9 "# Import the required libraries and open the connection to Mongo\n",
10 "\n",
11 "import collections\n",
12 "from datetime import datetime\n",
13 "import matplotlib as mpl\n",
14 "import matplotlib.pyplot as plt\n",
15 "%matplotlib inline\n",
16 "\n",
17 "import numpy as np\n",
18 "import pandas as pd\n",
19 "import scipy.stats\n",
20 "\n",
21 "import pymongo\n",
22 "from bson.objectid import ObjectId\n",
23 "# client = pymongo.MongoClient('mongodb://localhost:27117/')\n",
24 "# client = pymongo.MongoClient('mongodb://ogedei:27017')\n",
25 "client = pymongo.MongoClient('mongodb://localhost:27017')"
26 ]
27 },
28 {
29 "cell_type": "code",
30 "execution_count": 141,
31 "metadata": {
32 "collapsed": true
33 },
34 "outputs": [],
35 "source": [
36 "# Connect to the database\n",
37 "summerofcode_db = client.summerofcode\n",
38 "imp = summerofcode_db.imp\n",
39 "icmas = summerofcode_db.icmas"
40 ]
41 },
42 {
43 "cell_type": "code",
44 "execution_count": 139,
45 "metadata": {},
46 "outputs": [
47 {
48 "name": "stdout",
49 "output_type": "stream",
50 "text": [
51 "14.1 Basic CRUD.ipynb\r\n",
52 "2016J_TMA01_q1-TN.ipynb\r\n",
53 "2016J_TMA01_q2-withTN.ipynb\r\n",
54 "import-data-Copy1.ipynb\r\n",
55 "import-data.ipynb\r\n",
56 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\r\n",
57 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\r\n",
58 "SUMMEROFCODE-Day 2 Lifts-grades.csv\r\n",
59 "SUMMEROFCODE-Day 3 Door codes-grades.csv\r\n",
60 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\r\n",
61 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\r\n",
62 "TMA01_Question3_Solutions.ipynb\r\n"
63 ]
64 }
65 ],
66 "source": [
67 "!ls"
68 ]
69 },
70 {
71 "cell_type": "code",
72 "execution_count": 102,
73 "metadata": {},
74 "outputs": [
75 {
76 "data": {
77 "text/plain": [
78 "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n",
79 " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n",
80 " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n",
81 " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n",
82 " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n",
83 " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv']"
84 ]
85 },
86 "execution_count": 102,
87 "metadata": {},
88 "output_type": "execute_result"
89 }
90 ],
91 "source": [
92 "csvs = !ls *csv\n",
93 "csvs"
94 ]
95 },
96 {
97 "cell_type": "code",
98 "execution_count": 114,
99 "metadata": {
100 "collapsed": true
101 },
102 "outputs": [],
103 "source": [
104 "def update_grades():\n",
105 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
106 " imp.update_one({'_id': i['_id']},\n",
107 " {'$set': {'grade': i['Grade/10']['00'],\n",
108 " 'q1': i['Q'][' 1 /5']['00'],\n",
109 " 'q2': i['Q'][' 2 /5']['00']},\n",
110 " '$unset': {'Grade/10': '', 'Q': ''}})"
111 ]
112 },
113 {
114 "cell_type": "code",
115 "execution_count": 109,
116 "metadata": {},
117 "outputs": [],
118 "source": [
119 "def update_dates():\n",
120 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
121 " starttime = datetime.strptime(i['Started on'].strip(), '%d %b %Y %H:%M')\n",
122 " if i['Completed'] == '-':\n",
123 " imp.update_one({'_id': i['_id']},\n",
124 " {'$set': {'started': starttime},\n",
125 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})\n",
126 " else:\n",
127 " endtime = datetime.strptime(i['Completed'].strip(), '%d %b %Y %H:%M')\n",
128 " imp.update_one({'_id': i['_id']},\n",
129 " {'$set': {'started': starttime,\n",
130 " 'completed': endtime},\n",
131 " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})"
132 ]
133 },
134 {
135 "cell_type": "code",
136 "execution_count": 110,
137 "metadata": {
138 "collapsed": true
139 },
140 "outputs": [],
141 "source": [
142 "def merge_imported(icma_number):\n",
143 " icmas.delete_many({'icma_number': icma_number})\n",
144 "\n",
145 " for i in imp.find(modifiers={\"$snapshot\": True}):\n",
146 " del i['_id']\n",
147 " i['icma_number'] = icma_number\n",
148 " icmas.insert_one(i)"
149 ]
150 },
151 {
152 "cell_type": "code",
153 "execution_count": 111,
154 "metadata": {},
155 "outputs": [
156 {
157 "data": {
158 "text/plain": [
159 "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n",
160 " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n",
161 " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n",
162 " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n",
163 " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n",
164 " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv']"
165 ]
166 },
167 "execution_count": 111,
168 "metadata": {},
169 "output_type": "execute_result"
170 }
171 ],
172 "source": [
173 "csvs = !ls *csv\n",
174 "csvs"
175 ]
176 },
177 {
178 "cell_type": "code",
179 "execution_count": 123,
180 "metadata": {},
181 "outputs": [
182 {
183 "data": {
184 "text/plain": [
185 "<pymongo.results.DeleteResult at 0x7f3f2249fca8>"
186 ]
187 },
188 "execution_count": 123,
189 "metadata": {},
190 "output_type": "execute_result"
191 }
192 ],
193 "source": [
194 "icmas.delete_many({})"
195 ]
196 },
197 {
198 "cell_type": "code",
199 "execution_count": 132,
200 "metadata": {},
201 "outputs": [
202 {
203 "name": "stdout",
204 "output_type": "stream",
205 "text": [
206 "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\n",
207 "2017-07-22T15:24:32.622+0100\tconnected to: localhost\n",
208 "2017-07-22T15:24:32.622+0100\tdropping: summerofcode.imp\n",
209 "2017-07-22T15:24:32.629+0100\timported 285 documents\n",
210 "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\n",
211 "2017-07-22T15:24:32.971+0100\tconnected to: localhost\n",
212 "2017-07-22T15:24:32.971+0100\tdropping: summerofcode.imp\n",
213 "2017-07-22T15:24:32.974+0100\timported 137 documents\n",
214 "SUMMEROFCODE-Day 2 Lifts-grades.csv\n",
215 "2017-07-22T15:24:33.202+0100\tconnected to: localhost\n",
216 "2017-07-22T15:24:33.202+0100\tdropping: summerofcode.imp\n",
217 "2017-07-22T15:24:33.204+0100\timported 92 documents\n",
218 "SUMMEROFCODE-Day 3 Door codes-grades.csv\n",
219 "2017-07-22T15:24:33.394+0100\tconnected to: localhost\n",
220 "2017-07-22T15:24:33.394+0100\tdropping: summerofcode.imp\n",
221 "2017-07-22T15:24:33.396+0100\timported 77 documents\n",
222 "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\n",
223 "2017-07-22T15:24:33.573+0100\tconnected to: localhost\n",
224 "2017-07-22T15:24:33.573+0100\tdropping: summerofcode.imp\n",
225 "2017-07-22T15:24:33.575+0100\timported 79 documents\n",
226 "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\n",
227 "2017-07-22T15:24:33.753+0100\tconnected to: localhost\n",
228 "2017-07-22T15:24:33.753+0100\tdropping: summerofcode.imp\n",
229 "2017-07-22T15:24:33.755+0100\timported 57 documents\n"
230 ]
231 }
232 ],
233 "source": [
234 "for csv in csvs:\n",
235 " print(csv)\n",
236 " qcsv = \"'{}'\".format(csv)\n",
237 " !mongoimport --drop --db summerofcode --collection imp --type csv --headerline --ignoreBlanks --file {qcsv}\n",
238 " imp.delete_many({'Surname': 'Overall average'})\n",
239 " update_grades()\n",
240 " update_dates()\n",
241 " merge_imported(int(csv.split()[1]))"
242 ]
243 },
244 {
245 "cell_type": "code",
246 "execution_count": 134,
247 "metadata": {},
248 "outputs": [
249 {
250 "data": {
251 "text/plain": [
252 "721"
253 ]
254 },
255 "execution_count": 134,
256 "metadata": {},
257 "output_type": "execute_result"
258 }
259 ],
260 "source": [
261 "icmas.find({}, ['icma']).count()"
262 ]
263 },
264 {
265 "cell_type": "code",
266 "execution_count": 138,
267 "metadata": {},
268 "outputs": [
269 {
270 "data": {
271 "text/plain": [
272 "[{'_id': 3, 'count': 76},\n",
273 " {'_id': 2, 'count': 91},\n",
274 " {'_id': 5, 'count': 56},\n",
275 " {'_id': 4, 'count': 78},\n",
276 " {'_id': 1, 'count': 136},\n",
277 " {'_id': 0, 'count': 284}]"
278 ]
279 },
280 "execution_count": 138,
281 "metadata": {},
282 "output_type": "execute_result"
283 }
284 ],
285 "source": [
286 "pipeline = [{\"$group\": {\"_id\": \"$icma_number\", \"count\": {\"$sum\": 1}}}]\n",
287 "list(icmas.aggregate(pipeline))"
288 ]
289 },
290 {
291 "cell_type": "code",
292 "execution_count": null,
293 "metadata": {
294 "collapsed": true
295 },
296 "outputs": [],
297 "source": []
298 }
299 ],
300 "metadata": {
301 "kernelspec": {
302 "display_name": "Python 3",
303 "language": "python",
304 "name": "python3"
305 },
306 "language_info": {
307 "codemirror_mode": {
308 "name": "ipython",
309 "version": 3
310 },
311 "file_extension": ".py",
312 "mimetype": "text/x-python",
313 "name": "python",
314 "nbconvert_exporter": "python",
315 "pygments_lexer": "ipython3",
316 "version": "3.5.2+"
317 }
318 },
319 "nbformat": 4,
320 "nbformat_minor": 2
321 }