Data analysis
authorNeil Smith <neil.git@njae.me.uk>
Sat, 22 Jul 2017 18:11:34 +0000 (19:11 +0100)
committerNeil Smith <neil.git@njae.me.uk>
Sat, 22 Jul 2017 18:11:34 +0000 (19:11 +0100)
.gitignore
data-analysis/import-data.ipynb [new file with mode: 0644]

index 0546add21afd94c4049024b4460cc795e622f65a..31ec7fc2c121b27e754182e3325f3964f78114e5 100644 (file)
@@ -43,3 +43,9 @@ nosetests.xml
 
 # Logs
 *.log
+
+# CSV files, as they have personal data
+data-analysis/*csv
+data-analysis/winners.ipynb
+
+
diff --git a/data-analysis/import-data.ipynb b/data-analysis/import-data.ipynb
new file mode 100644 (file)
index 0000000..da7863e
--- /dev/null
@@ -0,0 +1,321 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import the required libraries and open the connection to Mongo\n",
+    "\n",
+    "import collections\n",
+    "from datetime import datetime\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scipy.stats\n",
+    "\n",
+    "import pymongo\n",
+    "from bson.objectid import ObjectId\n",
+    "# client = pymongo.MongoClient('mongodb://localhost:27117/')\n",
+    "# client = pymongo.MongoClient('mongodb://ogedei:27017')\n",
+    "client = pymongo.MongoClient('mongodb://localhost:27017')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 141,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Connect to the database\n",
+    "summerofcode_db = client.summerofcode\n",
+    "imp = summerofcode_db.imp\n",
+    "icmas = summerofcode_db.icmas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 139,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "14.1 Basic CRUD.ipynb\r\n",
+      "2016J_TMA01_q1-TN.ipynb\r\n",
+      "2016J_TMA01_q2-withTN.ipynb\r\n",
+      "import-data-Copy1.ipynb\r\n",
+      "import-data.ipynb\r\n",
+      "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\r\n",
+      "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\r\n",
+      "SUMMEROFCODE-Day 2 Lifts-grades.csv\r\n",
+      "SUMMEROFCODE-Day 3 Door codes-grades.csv\r\n",
+      "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\r\n",
+      "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\r\n",
+      "TMA01_Question3_Solutions.ipynb\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv']"
+      ]
+     },
+     "execution_count": 102,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "csvs = !ls *csv\n",
+    "csvs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def update_grades():\n",
+    "    for i in imp.find(modifiers={\"$snapshot\": True}):\n",
+    "        imp.update_one({'_id': i['_id']},\n",
+    "            {'$set': {'grade': i['Grade/10']['00'],\n",
+    "                      'q1': i['Q'][' 1 /5']['00'],\n",
+    "                      'q2': i['Q'][' 2 /5']['00']},\n",
+    "             '$unset': {'Grade/10': '', 'Q': ''}})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def update_dates():\n",
+    "    for i in imp.find(modifiers={\"$snapshot\": True}):\n",
+    "        starttime = datetime.strptime(i['Started on'].strip(), '%d %b %Y %H:%M')\n",
+    "        if i['Completed'] == '-':\n",
+    "            imp.update_one({'_id': i['_id']},\n",
+    "                           {'$set': {'started': starttime},\n",
+    "                            '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})\n",
+    "        else:\n",
+    "            endtime = datetime.strptime(i['Completed'].strip(), '%d %b %Y %H:%M')\n",
+    "            imp.update_one({'_id': i['_id']},\n",
+    "                           {'$set': {'started': starttime,\n",
+    "                                     'completed': endtime},\n",
+    "                            '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def merge_imported(icma_number):\n",
+    "    icmas.delete_many({'icma_number': icma_number})\n",
+    "\n",
+    "    for i in imp.find(modifiers={\"$snapshot\": True}):\n",
+    "        del i['_id']\n",
+    "        i['icma_number'] = icma_number\n",
+    "        icmas.insert_one(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n",
+       " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv']"
+      ]
+     },
+     "execution_count": 111,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "csvs = !ls *csv\n",
+    "csvs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<pymongo.results.DeleteResult at 0x7f3f2249fca8>"
+      ]
+     },
+     "execution_count": 123,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "icmas.delete_many({})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 132,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\n",
+      "2017-07-22T15:24:32.622+0100\tconnected to: localhost\n",
+      "2017-07-22T15:24:32.622+0100\tdropping: summerofcode.imp\n",
+      "2017-07-22T15:24:32.629+0100\timported 285 documents\n",
+      "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\n",
+      "2017-07-22T15:24:32.971+0100\tconnected to: localhost\n",
+      "2017-07-22T15:24:32.971+0100\tdropping: summerofcode.imp\n",
+      "2017-07-22T15:24:32.974+0100\timported 137 documents\n",
+      "SUMMEROFCODE-Day 2 Lifts-grades.csv\n",
+      "2017-07-22T15:24:33.202+0100\tconnected to: localhost\n",
+      "2017-07-22T15:24:33.202+0100\tdropping: summerofcode.imp\n",
+      "2017-07-22T15:24:33.204+0100\timported 92 documents\n",
+      "SUMMEROFCODE-Day 3 Door codes-grades.csv\n",
+      "2017-07-22T15:24:33.394+0100\tconnected to: localhost\n",
+      "2017-07-22T15:24:33.394+0100\tdropping: summerofcode.imp\n",
+      "2017-07-22T15:24:33.396+0100\timported 77 documents\n",
+      "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\n",
+      "2017-07-22T15:24:33.573+0100\tconnected to: localhost\n",
+      "2017-07-22T15:24:33.573+0100\tdropping: summerofcode.imp\n",
+      "2017-07-22T15:24:33.575+0100\timported 79 documents\n",
+      "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\n",
+      "2017-07-22T15:24:33.753+0100\tconnected to: localhost\n",
+      "2017-07-22T15:24:33.753+0100\tdropping: summerofcode.imp\n",
+      "2017-07-22T15:24:33.755+0100\timported 57 documents\n"
+     ]
+    }
+   ],
+   "source": [
+    "for csv in csvs:\n",
+    "    print(csv)\n",
+    "    qcsv = \"'{}'\".format(csv)\n",
+    "    !mongoimport --drop --db summerofcode --collection imp --type csv --headerline --ignoreBlanks --file {qcsv}\n",
+    "    imp.delete_many({'Surname': 'Overall average'})\n",
+    "    update_grades()\n",
+    "    update_dates()\n",
+    "    merge_imported(int(csv.split()[1]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "721"
+      ]
+     },
+     "execution_count": 134,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "icmas.find({}, ['icma']).count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'_id': 3, 'count': 76},\n",
+       " {'_id': 2, 'count': 91},\n",
+       " {'_id': 5, 'count': 56},\n",
+       " {'_id': 4, 'count': 78},\n",
+       " {'_id': 1, 'count': 136},\n",
+       " {'_id': 0, 'count': 284}]"
+      ]
+     },
+     "execution_count": 138,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipeline = [{\"$group\": {\"_id\": \"$icma_number\", \"count\": {\"$sum\": 1}}}]\n",
+    "list(icmas.aggregate(pipeline))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2+"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}