Imported all the notebooks
[tm351-notebooks.git] / notebooks / 21. Data Mining I / 21.a. Data Mining I- Nearest neighbours.ipynb
diff --git a/notebooks/21. Data Mining I/21.a. Data Mining I- Nearest neighbours.ipynb b/notebooks/21. Data Mining I/21.a. Data Mining I- Nearest neighbours.ipynb
new file mode 100644 (file)
index 0000000..c9c956e
--- /dev/null
@@ -0,0 +1,1479 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:7b7d78ee36dd3d421f76faba9f6ae6d8e8a341a444cf609c536929b941b39358"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "heading",
+     "level": 1,
+     "metadata": {},
+     "source": [
+      "Data Mining I: Nearest Neighbours"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "In this notebook, you will use the nearest neighbour tools available in Python to repeat the experiments that you carried out by hand in the module notes.\n",
+      "\n",
+      "We will see how to represent the data using pandas DataFrames, and how to use these dataframes to call the appropriate classifier functions."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import pandas as pd\n",
+      "\n",
+      "import matplotlib.pyplot as plt"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "/Users/agw96/virtualenv_environments/tm351/lib/python3.3/site-packages/pandas/io/excel.py:626: UserWarning: Installed openpyxl is not supported at this time. Use >=1.6.1 and <2.0.0.\n",
+        "  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))\n"
+       ]
+      }
+     ],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "heading",
+     "level": 3,
+     "metadata": {},
+     "source": [
+      "The `KNeighborsClassifier` classifier"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The scikit-learn library (imported in Python as `sklearn`) provides the `KNeighborsClassifier` class for carrying out *k*-nearest neighbour classification. The class is imported with the `import` statement:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from sklearn.neighbors import KNeighborsClassifier"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The first thing to do is to define a training set. A `KNeighborsClassifier` is trained by calling its `fit` method, which takes two arguments: the first is a list of training cases, and the second is a list giving the class of each training case.\n",
+      "\n",
+      "For example, suppose we were to use the data in figure 24.4. The datasets are:\n",
+      "\n",
+      "**Group A**\n",
+      "\n",
+      "|Patient Id|Exercise time (hours) |  Sleep time (hours)|\n",
+      "|:---:|:----:|:----:|\n",
+      "|A1|0.7|5.2|\n",
+      "|A2|0.6|5.6|\n",
+      "|A3|1.0|5.0|\n",
+      "|A4|1.8|5.5|\n",
+      "|A5|3.0|6.6|\n",
+      "|A6|2.5|6.7|\n",
+      "|A7|1.3|6.6|\n",
+      "|A8|0.8|7.6|\n",
+      "|A9|1.2|6.4|\n",
+      "|A10|1.7|7.2|\n",
+      "\n",
+      "**Group B**\n",
+      "\n",
+      "|Patient Id|Exercise time (hours) |  Sleep time (hours)|\n",
+      "|:---:|:----:|:----:|\n",
+      "|B1|2.7|6.2|\n",
+      "|B2|4.2|6.5|\n",
+      "|B3|4.3|7.1|\n",
+      "|B4|3.2|7.0|\n",
+      "|B5|3.5|6.0|\n",
+      "|B6|4.0|5.2|\n",
+      "|B7|2.7|7.2|\n",
+      "|B8|2.5|7.8|\n",
+      "|B9|3.8|6.4|\n",
+      "|B10|3.5|7.0|\n"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We can represent this data in pandas using dataframes:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "classA=pd.DataFrame({'Exercise time': [0.7, 0.6, 1.0, 1.8, 3.0, 2.5, 1.3, 0.8, 1.2, 1.7],\n",
+      "                      'Sleep time': [5.2, 5.6, 5.0, 5.5, 6.6, 6.7, 6.6, 7.6, 6.4, 7.2]},\n",
+      "                    index=['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10'])\n",
+      "\n",
+      "classB=pd.DataFrame({'Exercise time': [2.7, 4.2, 4.3, 3.2, 3.5, 4.0, 2.7, 2.5, 3.8, 3.5],\n",
+      "                     'Sleep time': [6.2, 6.5, 7.1, 7.0, 6.0, 5.2, 7.2, 7.8, 6.4, 7.0]},\n",
+      "                    index=['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B9', 'B10'])\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "classA"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Exercise time</th>\n",
+        "      <th>Sleep time</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>A1</th>\n",
+        "      <td> 0.7</td>\n",
+        "      <td> 5.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A2</th>\n",
+        "      <td> 0.6</td>\n",
+        "      <td> 5.6</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A3</th>\n",
+        "      <td> 1.0</td>\n",
+        "      <td> 5.0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A4</th>\n",
+        "      <td> 1.8</td>\n",
+        "      <td> 5.5</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A5</th>\n",
+        "      <td> 3.0</td>\n",
+        "      <td> 6.6</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A6</th>\n",
+        "      <td> 2.5</td>\n",
+        "      <td> 6.7</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A7</th>\n",
+        "      <td> 1.3</td>\n",
+        "      <td> 6.6</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A8</th>\n",
+        "      <td> 0.8</td>\n",
+        "      <td> 7.6</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A9</th>\n",
+        "      <td> 1.2</td>\n",
+        "      <td> 6.4</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A10</th>\n",
+        "      <td> 1.7</td>\n",
+        "      <td> 7.2</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 4,
+       "text": [
+        "     Exercise time  Sleep time\n",
+        "A1             0.7         5.2\n",
+        "A2             0.6         5.6\n",
+        "A3             1.0         5.0\n",
+        "A4             1.8         5.5\n",
+        "A5             3.0         6.6\n",
+        "A6             2.5         6.7\n",
+        "A7             1.3         6.6\n",
+        "A8             0.8         7.6\n",
+        "A9             1.2         6.4\n",
+        "A10            1.7         7.2"
+       ]
+      }
+     ],
+     "prompt_number": 4
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "classB"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Exercise time</th>\n",
+        "      <th>Sleep time</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>B1</th>\n",
+        "      <td> 2.7</td>\n",
+        "      <td> 6.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B2</th>\n",
+        "      <td> 4.2</td>\n",
+        "      <td> 6.5</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B3</th>\n",
+        "      <td> 4.3</td>\n",
+        "      <td> 7.1</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B4</th>\n",
+        "      <td> 3.2</td>\n",
+        "      <td> 7.0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B5</th>\n",
+        "      <td> 3.5</td>\n",
+        "      <td> 6.0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B6</th>\n",
+        "      <td> 4.0</td>\n",
+        "      <td> 5.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B7</th>\n",
+        "      <td> 2.7</td>\n",
+        "      <td> 7.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B8</th>\n",
+        "      <td> 2.5</td>\n",
+        "      <td> 7.8</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B9</th>\n",
+        "      <td> 3.8</td>\n",
+        "      <td> 6.4</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B10</th>\n",
+        "      <td> 3.5</td>\n",
+        "      <td> 7.0</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 5,
+       "text": [
+        "     Exercise time  Sleep time\n",
+        "B1             2.7         6.2\n",
+        "B2             4.2         6.5\n",
+        "B3             4.3         7.1\n",
+        "B4             3.2         7.0\n",
+        "B5             3.5         6.0\n",
+        "B6             4.0         5.2\n",
+        "B7             2.7         7.2\n",
+        "B8             2.5         7.8\n",
+        "B9             3.8         6.4\n",
+        "B10            3.5         7.0"
+       ]
+      }
+     ],
+     "prompt_number": 5
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "\n",
+      "To train the classifier on this data, the `fit` method of `KNeighborsClassifier` takes two arguments: the first is a list of the training cases, and the second is a list containing the class label of each case.\n",
+      "\n",
+      "The easiest way to implement this in python is to add a new column to each of the dataframes containing the training data, which contains the class of each instance.\n",
+      "\n",
+      "This is straightforward in pandas. (We will add the class column to a *copy* of each of `classA` and `classB`, so that the original data can be retrieved if necessary.)\n"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Make a copy of the original data\n",
+      "trainingDataA = classA.copy()\n",
+      "\n",
+      "# Add a further column to add the classification data\n",
+      "trainingDataA['class']='A'\n",
+      "\n",
+      "trainingDataA"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Exercise time</th>\n",
+        "      <th>Sleep time</th>\n",
+        "      <th>class</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>A1</th>\n",
+        "      <td> 0.7</td>\n",
+        "      <td> 5.2</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A2</th>\n",
+        "      <td> 0.6</td>\n",
+        "      <td> 5.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A3</th>\n",
+        "      <td> 1.0</td>\n",
+        "      <td> 5.0</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A4</th>\n",
+        "      <td> 1.8</td>\n",
+        "      <td> 5.5</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A5</th>\n",
+        "      <td> 3.0</td>\n",
+        "      <td> 6.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A6</th>\n",
+        "      <td> 2.5</td>\n",
+        "      <td> 6.7</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A7</th>\n",
+        "      <td> 1.3</td>\n",
+        "      <td> 6.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A8</th>\n",
+        "      <td> 0.8</td>\n",
+        "      <td> 7.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A9</th>\n",
+        "      <td> 1.2</td>\n",
+        "      <td> 6.4</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>A10</th>\n",
+        "      <td> 1.7</td>\n",
+        "      <td> 7.2</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 6,
+       "text": [
+        "     Exercise time  Sleep time class\n",
+        "A1             0.7         5.2     A\n",
+        "A2             0.6         5.6     A\n",
+        "A3             1.0         5.0     A\n",
+        "A4             1.8         5.5     A\n",
+        "A5             3.0         6.6     A\n",
+        "A6             2.5         6.7     A\n",
+        "A7             1.3         6.6     A\n",
+        "A8             0.8         7.6     A\n",
+        "A9             1.2         6.4     A\n",
+        "A10            1.7         7.2     A"
+       ]
+      }
+     ],
+     "prompt_number": 6
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Make a copy of the original data\n",
+      "trainingDataB = classB.copy()\n",
+      "\n",
+      "# Add a further column to add the classification data\n",
+      "trainingDataB['class']='B'\n",
+      "\n",
+      "trainingDataB"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Exercise time</th>\n",
+        "      <th>Sleep time</th>\n",
+        "      <th>class</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>B1</th>\n",
+        "      <td> 2.7</td>\n",
+        "      <td> 6.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B2</th>\n",
+        "      <td> 4.2</td>\n",
+        "      <td> 6.5</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B3</th>\n",
+        "      <td> 4.3</td>\n",
+        "      <td> 7.1</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B4</th>\n",
+        "      <td> 3.2</td>\n",
+        "      <td> 7.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B5</th>\n",
+        "      <td> 3.5</td>\n",
+        "      <td> 6.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B6</th>\n",
+        "      <td> 4.0</td>\n",
+        "      <td> 5.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B7</th>\n",
+        "      <td> 2.7</td>\n",
+        "      <td> 7.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B8</th>\n",
+        "      <td> 2.5</td>\n",
+        "      <td> 7.8</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B9</th>\n",
+        "      <td> 3.8</td>\n",
+        "      <td> 6.4</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>B10</th>\n",
+        "      <td> 3.5</td>\n",
+        "      <td> 7.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 7,
+       "text": [
+        "     Exercise time  Sleep time class\n",
+        "B1             2.7         6.2     B\n",
+        "B2             4.2         6.5     B\n",
+        "B3             4.3         7.1     B\n",
+        "B4             3.2         7.0     B\n",
+        "B5             3.5         6.0     B\n",
+        "B6             4.0         5.2     B\n",
+        "B7             2.7         7.2     B\n",
+        "B8             2.5         7.8     B\n",
+        "B9             3.8         6.4     B\n",
+        "B10            3.5         7.0     B"
+       ]
+      }
+     ],
+     "prompt_number": 7
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We can create a single DataFrame, `trainingData`, by concatenating the DataFrames `trainingDataA` and `trainingDataB`:\n",
+      "\n",
+      "<div style='color:red'>Do we need to draw attention to the reindexing? It's not particularly relevant to the point. Depends on how much they've seen on indexing in the earlier sections, I guess (Tony and Steven are still discussing this).</div>"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Stack the two labelled DataFrames into one training set.\n",
+      "# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.\n",
+      "# ignore_index=True discards the A1..B10 labels and renumbers the rows from 0.\n",
+      "trainingData = pd.concat([trainingDataA, trainingDataB], ignore_index=True)\n",
+      "\n",
+      "trainingData"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Exercise time</th>\n",
+        "      <th>Sleep time</th>\n",
+        "      <th>class</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0 </th>\n",
+        "      <td> 0.7</td>\n",
+        "      <td> 5.2</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1 </th>\n",
+        "      <td> 0.6</td>\n",
+        "      <td> 5.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2 </th>\n",
+        "      <td> 1.0</td>\n",
+        "      <td> 5.0</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3 </th>\n",
+        "      <td> 1.8</td>\n",
+        "      <td> 5.5</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4 </th>\n",
+        "      <td> 3.0</td>\n",
+        "      <td> 6.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>5 </th>\n",
+        "      <td> 2.5</td>\n",
+        "      <td> 6.7</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>6 </th>\n",
+        "      <td> 1.3</td>\n",
+        "      <td> 6.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>7 </th>\n",
+        "      <td> 0.8</td>\n",
+        "      <td> 7.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>8 </th>\n",
+        "      <td> 1.2</td>\n",
+        "      <td> 6.4</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>9 </th>\n",
+        "      <td> 1.7</td>\n",
+        "      <td> 7.2</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>10</th>\n",
+        "      <td> 2.7</td>\n",
+        "      <td> 6.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>11</th>\n",
+        "      <td> 4.2</td>\n",
+        "      <td> 6.5</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>12</th>\n",
+        "      <td> 4.3</td>\n",
+        "      <td> 7.1</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>13</th>\n",
+        "      <td> 3.2</td>\n",
+        "      <td> 7.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>14</th>\n",
+        "      <td> 3.5</td>\n",
+        "      <td> 6.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>15</th>\n",
+        "      <td> 4.0</td>\n",
+        "      <td> 5.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>16</th>\n",
+        "      <td> 2.7</td>\n",
+        "      <td> 7.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>17</th>\n",
+        "      <td> 2.5</td>\n",
+        "      <td> 7.8</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>18</th>\n",
+        "      <td> 3.8</td>\n",
+        "      <td> 6.4</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>19</th>\n",
+        "      <td> 3.5</td>\n",
+        "      <td> 7.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 8,
+       "text": [
+        "    Exercise time  Sleep time class\n",
+        "0             0.7         5.2     A\n",
+        "1             0.6         5.6     A\n",
+        "2             1.0         5.0     A\n",
+        "3             1.8         5.5     A\n",
+        "4             3.0         6.6     A\n",
+        "5             2.5         6.7     A\n",
+        "6             1.3         6.6     A\n",
+        "7             0.8         7.6     A\n",
+        "8             1.2         6.4     A\n",
+        "9             1.7         7.2     A\n",
+        "10            2.7         6.2     B\n",
+        "11            4.2         6.5     B\n",
+        "12            4.3         7.1     B\n",
+        "13            3.2         7.0     B\n",
+        "14            3.5         6.0     B\n",
+        "15            4.0         5.2     B\n",
+        "16            2.7         7.2     B\n",
+        "17            2.5         7.8     B\n",
+        "18            3.8         6.4     B\n",
+        "19            3.5         7.0     B"
+       ]
+      }
+     ],
+     "prompt_number": 8
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The next step is to construct an appropriate classifier. Calling `KNeighborsClassifier` returns a classifier object which can be trained with the training data. The classifier is constructed with a suitable value of *k* (the `n_neighbors` parameter) as follows:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "k=3\n",
+      "classifier3 = KNeighborsClassifier(n_neighbors=k)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 9
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "To train the classifier, call its `fit` method with the training data as the first argument and the classes of the data points as the second argument:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "classifier3.fit(trainingData[['Exercise time', 'Sleep time']],\n",
+      "                trainingData['class'])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 10,
+       "text": [
+        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
+        "           n_neighbors=3, p=2, weights='uniform')"
+       ]
+      }
+     ],
+     "prompt_number": 10
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Our k-NN classifier is now ready to be used. To classify a new instance, we call the classifier's `predict` method with the new instance.\n",
+      "\n",
+      "So in this case, if the new patient had an exercise time of 2.5 hours, and a sleep time of 6.5 hours, then to classify him/her, we call:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# predict expects a 2-D array-like with one row per instance to classify,\n",
+      "# so the single new patient is passed as a list containing one [exercise, sleep] row.\n",
+      "print(classifier3.predict([[2.5, 6.5]]))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "['A']\n"
+       ]
+      }
+     ],
+     "prompt_number": 11
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "The classifier predicts that the patient should be placed in class A."
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "\n",
+      "To implement a weighted voting scheme in Python, we can simply set the parameter `weights` to `'distance'` when we construct the classifier, as follows:\n"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "k=3\n",
+      "classifier3 = KNeighborsClassifier(n_neighbors=k, weights='distance')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 12
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "classifier3.fit(trainingData[['Exercise time', 'Sleep time']],\n",
+      "                trainingData['class'])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 13,
+       "text": [
+        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
+        "           n_neighbors=3, p=2, weights='distance')"
+       ]
+      }
+     ],
+     "prompt_number": 13
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "If the parameter `weights` is not specified, it defaults to `'uniform'`, meaning that each of the nearest neighbours is weighted equally."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# predict expects a 2-D array-like with one row per instance to classify,\n",
+      "# so the single new patient is passed as a list containing one [exercise, sleep] row.\n",
+      "print(classifier3.predict([[2.5, 6.5]]))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "['A']\n"
+       ]
+      }
+     ],
+     "prompt_number": 14
+    },
+    {
+     "cell_type": "heading",
+     "level": 3,
+     "metadata": {},
+     "source": [
+      "Exercise 24.5a"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Redo Exercise 24.3 using Python."
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We provide data for the two classes, *class A* and *class B*, in the variables `classA` and `classB` respectively. Note that this replaces the exercise and sleep data previously held in these variables:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "classA=pd.DataFrame({'Attribute 1':[15.7, 15.6, 16.0, 16.8, 18.0, 17.5, 16.3, 15.8, 16.2, 16.7],\n",
+      "                     'Attribute 2':[20.2, 20.6, 20.0, 20.5, 21.6, 21.7, 21.6, 22.6, 21.4, 22.2],\n",
+      "                     'Attribute 3':[1.3, 2.2, 3.4, 4.2, 5.5, 6.7, 7.6, 8.2, 9.0, 9.7]})\n",
+      "\n",
+      "classB=pd.DataFrame({'Attribute 1':[14.05, 16.3, 16.45, 14.8, 15.25, 16.0, 14.05, 13.75, 15.7, 15.25],\n",
+      "                     'Attribute 2':[19.3, 19.75, 20.65, 20.5, 19.0, 17.8, 20.8, 21.7, 19.6, 20.5],\n",
+      "                     'Attribute 3':[6.6, 7.2, 8.0, 9.1, 9.9, 11.0, 12.2, 13.2, 14.5, 14.5]})\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 15
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "classA"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Attribute 1</th>\n",
+        "      <th>Attribute 2</th>\n",
+        "      <th>Attribute 3</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td> 15.7</td>\n",
+        "      <td> 20.2</td>\n",
+        "      <td> 1.3</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td> 15.6</td>\n",
+        "      <td> 20.6</td>\n",
+        "      <td> 2.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td> 16.0</td>\n",
+        "      <td> 20.0</td>\n",
+        "      <td> 3.4</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td> 16.8</td>\n",
+        "      <td> 20.5</td>\n",
+        "      <td> 4.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td> 18.0</td>\n",
+        "      <td> 21.6</td>\n",
+        "      <td> 5.5</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>5</th>\n",
+        "      <td> 17.5</td>\n",
+        "      <td> 21.7</td>\n",
+        "      <td> 6.7</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>6</th>\n",
+        "      <td> 16.3</td>\n",
+        "      <td> 21.6</td>\n",
+        "      <td> 7.6</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>7</th>\n",
+        "      <td> 15.8</td>\n",
+        "      <td> 22.6</td>\n",
+        "      <td> 8.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>8</th>\n",
+        "      <td> 16.2</td>\n",
+        "      <td> 21.4</td>\n",
+        "      <td> 9.0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>9</th>\n",
+        "      <td> 16.7</td>\n",
+        "      <td> 22.2</td>\n",
+        "      <td> 9.7</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 16,
+       "text": [
+        "   Attribute 1  Attribute 2  Attribute 3\n",
+        "0         15.7         20.2          1.3\n",
+        "1         15.6         20.6          2.2\n",
+        "2         16.0         20.0          3.4\n",
+        "3         16.8         20.5          4.2\n",
+        "4         18.0         21.6          5.5\n",
+        "5         17.5         21.7          6.7\n",
+        "6         16.3         21.6          7.6\n",
+        "7         15.8         22.6          8.2\n",
+        "8         16.2         21.4          9.0\n",
+        "9         16.7         22.2          9.7"
+       ]
+      }
+     ],
+     "prompt_number": 16
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Display the classB dataframe so that its rows can be inspected.\n",
+      "classB"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Attribute 1</th>\n",
+        "      <th>Attribute 2</th>\n",
+        "      <th>Attribute 3</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td> 14.05</td>\n",
+        "      <td> 19.30</td>\n",
+        "      <td>  6.6</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td> 16.30</td>\n",
+        "      <td> 19.75</td>\n",
+        "      <td>  7.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td> 16.45</td>\n",
+        "      <td> 20.65</td>\n",
+        "      <td>  8.0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td> 14.80</td>\n",
+        "      <td> 20.50</td>\n",
+        "      <td>  9.1</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td> 15.25</td>\n",
+        "      <td> 19.00</td>\n",
+        "      <td>  9.9</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>5</th>\n",
+        "      <td> 16.00</td>\n",
+        "      <td> 17.80</td>\n",
+        "      <td> 11.0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>6</th>\n",
+        "      <td> 14.05</td>\n",
+        "      <td> 20.80</td>\n",
+        "      <td> 12.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>7</th>\n",
+        "      <td> 13.75</td>\n",
+        "      <td> 21.70</td>\n",
+        "      <td> 13.2</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>8</th>\n",
+        "      <td> 15.70</td>\n",
+        "      <td> 19.60</td>\n",
+        "      <td> 14.5</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>9</th>\n",
+        "      <td> 15.25</td>\n",
+        "      <td> 20.50</td>\n",
+        "      <td> 14.5</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 17,
+       "text": [
+        "   Attribute 1  Attribute 2  Attribute 3\n",
+        "0        14.05        19.30          6.6\n",
+        "1        16.30        19.75          7.2\n",
+        "2        16.45        20.65          8.0\n",
+        "3        14.80        20.50          9.1\n",
+        "4        15.25        19.00          9.9\n",
+        "5        16.00        17.80         11.0\n",
+        "6        14.05        20.80         12.2\n",
+        "7        13.75        21.70         13.2\n",
+        "8        15.70        19.60         14.5\n",
+        "9        15.25        20.50         14.5"
+       ]
+      }
+     ],
+     "prompt_number": 17
+    },
+    {
+     "cell_type": "heading",
+     "level": 4,
+     "metadata": {},
+     "source": [
+      "Solution 24.5(a)"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "To redo Exercise 24.3, we will follow the same pattern that we have just seen in the notebook.\n"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "First, take training copies of the two classes, and add a new column to each to represent the class that each data point belongs to."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Work on copies so that adding the label column leaves classA and classB unchanged.\n",
+      "trainingDataA = classA.copy()\n",
+      "trainingDataA['class'] = 'A'\n",
+      "\n",
+      "trainingDataB = classB.copy()\n",
+      "trainingDataB['class'] = 'B'\n",
+      "\n",
+      "# Stack the two labelled frames into a single training set. pd.concat is used\n",
+      "# because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.\n",
+      "# ignore_index=True renumbers the combined rows from 0 instead of repeating 0-9.\n",
+      "trainingData = pd.concat([trainingDataA, trainingDataB], ignore_index=True)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 18
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Display the combined training set to check the rows and their class labels.\n",
+      "trainingData"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "html": [
+        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>Attribute 1</th>\n",
+        "      <th>Attribute 2</th>\n",
+        "      <th>Attribute 3</th>\n",
+        "      <th>class</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0 </th>\n",
+        "      <td> 15.70</td>\n",
+        "      <td> 20.20</td>\n",
+        "      <td>  1.3</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1 </th>\n",
+        "      <td> 15.60</td>\n",
+        "      <td> 20.60</td>\n",
+        "      <td>  2.2</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2 </th>\n",
+        "      <td> 16.00</td>\n",
+        "      <td> 20.00</td>\n",
+        "      <td>  3.4</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3 </th>\n",
+        "      <td> 16.80</td>\n",
+        "      <td> 20.50</td>\n",
+        "      <td>  4.2</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4 </th>\n",
+        "      <td> 18.00</td>\n",
+        "      <td> 21.60</td>\n",
+        "      <td>  5.5</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>5 </th>\n",
+        "      <td> 17.50</td>\n",
+        "      <td> 21.70</td>\n",
+        "      <td>  6.7</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>6 </th>\n",
+        "      <td> 16.30</td>\n",
+        "      <td> 21.60</td>\n",
+        "      <td>  7.6</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>7 </th>\n",
+        "      <td> 15.80</td>\n",
+        "      <td> 22.60</td>\n",
+        "      <td>  8.2</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>8 </th>\n",
+        "      <td> 16.20</td>\n",
+        "      <td> 21.40</td>\n",
+        "      <td>  9.0</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>9 </th>\n",
+        "      <td> 16.70</td>\n",
+        "      <td> 22.20</td>\n",
+        "      <td>  9.7</td>\n",
+        "      <td> A</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>10</th>\n",
+        "      <td> 14.05</td>\n",
+        "      <td> 19.30</td>\n",
+        "      <td>  6.6</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>11</th>\n",
+        "      <td> 16.30</td>\n",
+        "      <td> 19.75</td>\n",
+        "      <td>  7.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>12</th>\n",
+        "      <td> 16.45</td>\n",
+        "      <td> 20.65</td>\n",
+        "      <td>  8.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>13</th>\n",
+        "      <td> 14.80</td>\n",
+        "      <td> 20.50</td>\n",
+        "      <td>  9.1</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>14</th>\n",
+        "      <td> 15.25</td>\n",
+        "      <td> 19.00</td>\n",
+        "      <td>  9.9</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>15</th>\n",
+        "      <td> 16.00</td>\n",
+        "      <td> 17.80</td>\n",
+        "      <td> 11.0</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>16</th>\n",
+        "      <td> 14.05</td>\n",
+        "      <td> 20.80</td>\n",
+        "      <td> 12.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>17</th>\n",
+        "      <td> 13.75</td>\n",
+        "      <td> 21.70</td>\n",
+        "      <td> 13.2</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>18</th>\n",
+        "      <td> 15.70</td>\n",
+        "      <td> 19.60</td>\n",
+        "      <td> 14.5</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>19</th>\n",
+        "      <td> 15.25</td>\n",
+        "      <td> 20.50</td>\n",
+        "      <td> 14.5</td>\n",
+        "      <td> B</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 19,
+       "text": [
+        "    Attribute 1  Attribute 2  Attribute 3 class\n",
+        "0         15.70        20.20          1.3     A\n",
+        "1         15.60        20.60          2.2     A\n",
+        "2         16.00        20.00          3.4     A\n",
+        "3         16.80        20.50          4.2     A\n",
+        "4         18.00        21.60          5.5     A\n",
+        "5         17.50        21.70          6.7     A\n",
+        "6         16.30        21.60          7.6     A\n",
+        "7         15.80        22.60          8.2     A\n",
+        "8         16.20        21.40          9.0     A\n",
+        "9         16.70        22.20          9.7     A\n",
+        "10        14.05        19.30          6.6     B\n",
+        "11        16.30        19.75          7.2     B\n",
+        "12        16.45        20.65          8.0     B\n",
+        "13        14.80        20.50          9.1     B\n",
+        "14        15.25        19.00          9.9     B\n",
+        "15        16.00        17.80         11.0     B\n",
+        "16        14.05        20.80         12.2     B\n",
+        "17        13.75        21.70         13.2     B\n",
+        "18        15.70        19.60         14.5     B\n",
+        "19        15.25        20.50         14.5     B"
+       ]
+      }
+     ],
+     "prompt_number": 19
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "In this case, we want to use a classifier with *k=5*, so initialise the classifier appropriately:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "\n",
+      "# Create a k-NN classifier that votes over the k nearest neighbours.\n",
+      "# This only constructs the classifier; it is not trained until fit() is called.\n",
+      "k = 5\n",
+      "classifier5 = KNeighborsClassifier(n_neighbors=k)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 20
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "...and train the classifier again, this time using the attribute columns and class labels in the `trainingData` dataframe:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Fit on the three attribute columns (X), using the 'class' column as the labels (y).\n",
+      "classifier5.fit(X=trainingData[['Attribute 1', 'Attribute 2', 'Attribute 3']],\n",
+      "                y=trainingData['class'])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "metadata": {},
+       "output_type": "pyout",
+       "prompt_number": 21,
+       "text": [
+        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
+        "           n_neighbors=5, p=2, weights='uniform')"
+       ]
+      }
+     ],
+     "prompt_number": 21
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We can then predict the class of the new data point by calling the `predict` method on the classifier:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Classify a new data point and print the predicted class label.\n",
+      "# predict() expects a 2D array-like of shape (n_samples, n_features), so the\n",
+      "# single point is wrapped in an outer list to form one row of three attributes.\n",
+      "print(classifier5.predict([[15.8, 20.1, 7.0]]))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "['B']\n"
+       ]
+      }
+     ],
+     "prompt_number": 22
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Has the new data point been correctly classified?"
+     ]
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file