From 2f9c7346690126c33c311b31f2c71d6442753a04 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Sat, 22 Jul 2017 19:11:34 +0100 Subject: [PATCH] Data analysis --- .gitignore | 6 + data-analysis/import-data.ipynb | 321 ++++++++++++++++++++++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 data-analysis/import-data.ipynb diff --git a/.gitignore b/.gitignore index 0546add..31ec7fc 100644 --- a/.gitignore +++ b/.gitignore @@ -43,3 +43,9 @@ nosetests.xml # Logs *.log + +# CSV files, as they have personal data +data-analysis/*csv +data-analysis/winners.ipynb + + diff --git a/data-analysis/import-data.ipynb b/data-analysis/import-data.ipynb new file mode 100644 index 0000000..da7863e --- /dev/null +++ b/data-analysis/import-data.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the required libraries and open the connection to Mongo\n", + "\n", + "import collections\n", + "from datetime import datetime\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.stats\n", + "\n", + "import pymongo\n", + "from bson.objectid import ObjectId\n", + "# client = pymongo.MongoClient('mongodb://localhost:27117/')\n", + "# client = pymongo.MongoClient('mongodb://ogedei:27017')\n", + "client = pymongo.MongoClient('mongodb://localhost:27017')" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Connect to the database\n", + "summerofcode_db = client.summerofcode\n", + "imp = summerofcode_db.imp\n", + "icmas = summerofcode_db.icmas" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "14.1 Basic CRUD.ipynb\r\n", + "2016J_TMA01_q1-TN.ipynb\r\n", + "2016J_TMA01_q2-withTN.ipynb\r\n", + "import-data-Copy1.ipynb\r\n", + "import-data.ipynb\r\n", + "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\r\n", + "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\r\n", + "SUMMEROFCODE-Day 2 Lifts-grades.csv\r\n", + "SUMMEROFCODE-Day 3 Door codes-grades.csv\r\n", + "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\r\n", + "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\r\n", + "TMA01_Question3_Solutions.ipynb\r\n" + ] + } + ], + "source": [ + "!ls" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n", + " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n", + " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n", + " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n", + " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n", + " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv']" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs = !ls *csv\n", + "csvs" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def update_grades():\n", + " for i in imp.find(modifiers={\"$snapshot\": True}):\n", + " imp.update_one({'_id': i['_id']},\n", + " {'$set': {'grade': i['Grade/10']['00'],\n", + " 'q1': i['Q'][' 1 /5']['00'],\n", + " 'q2': i['Q'][' 2 /5']['00']},\n", + " '$unset': {'Grade/10': '', 'Q': ''}})" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "def update_dates():\n", + " for i in imp.find(modifiers={\"$snapshot\": True}):\n", + " starttime = datetime.strptime(i['Started on'].strip(), '%d %b %Y %H:%M')\n", + " if i['Completed'] == '-':\n", + " imp.update_one({'_id': i['_id']},\n", + " {'$set': {'started': starttime},\n", + " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})\n", + " else:\n", + " endtime = datetime.strptime(i['Completed'].strip(), '%d %b %Y %H:%M')\n", + " imp.update_one({'_id': i['_id']},\n", + " {'$set': {'started': starttime,\n", + " 'completed': endtime},\n", + " '$unset': {'Started on': '', 'Completed': '', 'Time taken': ''}})" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def merge_imported(icma_number):\n", + " icmas.delete_many({'icma_number': icma_number})\n", + "\n", + " for i in imp.find(modifiers={\"$snapshot\": True}):\n", + " del i['_id']\n", + " i['icma_number'] = icma_number\n", + " icmas.insert_one(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv',\n", + " 'SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv',\n", + " 'SUMMEROFCODE-Day 2 Lifts-grades.csv',\n", + " 'SUMMEROFCODE-Day 3 Door codes-grades.csv',\n", + " 'SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv',\n", + " 'SUMMEROFCODE-Day 5 Laser display boards-grades.csv']" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csvs = !ls *csv\n", + "csvs" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "icmas.delete_many({})" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SUMMEROFCODE-Day 0 Warm up holiday prices-grades.csv\n", + "2017-07-22T15:24:32.622+0100\tconnected to: localhost\n", + "2017-07-22T15:24:32.622+0100\tdropping: summerofcode.imp\n", + "2017-07-22T15:24:32.629+0100\timported 285 documents\n", + "SUMMEROFCODE-Day 1 Choosing a holiday-grades.csv\n", + "2017-07-22T15:24:32.971+0100\tconnected to: localhost\n", + "2017-07-22T15:24:32.971+0100\tdropping: summerofcode.imp\n", + "2017-07-22T15:24:32.974+0100\timported 137 documents\n", + "SUMMEROFCODE-Day 2 Lifts-grades.csv\n", + "2017-07-22T15:24:33.202+0100\tconnected to: localhost\n", + "2017-07-22T15:24:33.202+0100\tdropping: summerofcode.imp\n", + "2017-07-22T15:24:33.204+0100\timported 92 documents\n", + "SUMMEROFCODE-Day 3 Door codes-grades.csv\n", + "2017-07-22T15:24:33.394+0100\tconnected to: localhost\n", + "2017-07-22T15:24:33.394+0100\tdropping: summerofcode.imp\n", + "2017-07-22T15:24:33.396+0100\timported 77 documents\n", + "SUMMEROFCODE-Day 4 Beach labyrinth-grades.csv\n", + "2017-07-22T15:24:33.573+0100\tconnected to: localhost\n", + "2017-07-22T15:24:33.573+0100\tdropping: summerofcode.imp\n", + "2017-07-22T15:24:33.575+0100\timported 79 documents\n", + "SUMMEROFCODE-Day 5 Laser display boards-grades.csv\n", + "2017-07-22T15:24:33.753+0100\tconnected to: localhost\n", + "2017-07-22T15:24:33.753+0100\tdropping: summerofcode.imp\n", + "2017-07-22T15:24:33.755+0100\timported 57 documents\n" + ] + } + ], + "source": [ + "for csv in csvs:\n", + " print(csv)\n", + " qcsv = \"'{}'\".format(csv)\n", + " !mongoimport --drop --db summerofcode --collection imp --type csv --headerline --ignoreBlanks --file {qcsv}\n", + " imp.delete_many({'Surname': 'Overall average'})\n", + " update_grades()\n", + " update_dates()\n", + " merge_imported(int(csv.split()[1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "721" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "icmas.find({}, ['icma']).count()" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'_id': 3, 'count': 76},\n", + " {'_id': 2, 'count': 91},\n", + " {'_id': 5, 'count': 56},\n", + " {'_id': 4, 'count': 78},\n", + " {'_id': 1, 'count': 136},\n", + " {'_id': 0, 'count': 284}]" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline = [{\"$group\": {\"_id\": \"$icma_number\", \"count\": {\"$sum\": 1}}}]\n", + "list(icmas.aggregate(pipeline))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2+" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- 2.34.1