Imported all the notebooks
[tm351-notebooks.git] / notebooks / Example Linked Data Notebooks / LD.02.1 OU Linked Data Demo.ipynb
diff --git a/notebooks/Example Linked Data Notebooks/LD.02.1 OU Linked Data Demo.ipynb b/notebooks/Example Linked Data Notebooks/LD.02.1 OU Linked Data Demo.ipynb
new file mode 100644 (file)
index 0000000..0a9ad01
--- /dev/null
@@ -0,0 +1,475 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:e062294dfcf41b7573bf004f823ea29ccd657b9f4debcbb2afd49ff3592180c9"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "heading",
+     "level": 1,
+     "metadata": {},
+     "source": [
+      "OU Linked Data Demo"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "This notebook provides a quick tour of some of the data on the Open University's Open Linked Data platform."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#Install a library to help us run some SPARQL queries if we haven't already installed it\n",
+      "#http://rdflib.github.io/sparqlwrapper/\n",
+      "#!pip3 install sparqlwrapper"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#Import the necessary packages\n",
+      "from SPARQLWrapper import SPARQLWrapper, JSON"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#Declare the Open University open Linked Data endpoint\n",
+      "endpoint=\"http://data.open.ac.uk/query\""
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "To get the data into a form where it's immediately useable, we can define some helper functions that will parse the result of the query into a *pandas* dataframe.\n",
+      "\n",
+      "Note that some endpoints can return data in a variety of data formats, including serialsed JSON and flattened CSV data tables that can be dropped directly into a *pandas* dataframe.\n",
+      "\n",
+      "We will stick with using RDF based responses for these notebooks."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#A helper...\n",
+      "def runQuery(endpoint,prefix,q):\n",
+      "    ''' Run a SPARQL query with a declared prefix over a specified endpoint '''\n",
+      "    sparql = SPARQLWrapper(endpoint)\n",
+      "    sparql.setQuery(prefix+q)\n",
+      "    sparql.setReturnFormat(JSON)\n",
+      "    return sparql.query().convert()\n",
+      "\n",
+      "import pandas as pd\n",
+      "#And some more helpers\n",
+      "def dict2df(results):\n",
+      "    ''' Hack a function to flatten the SPARQL query results and return the column values '''\n",
+      "    data=[]\n",
+      "    for result in results[\"results\"][\"bindings\"]:\n",
+      "        tmp={}\n",
+      "        for el in result:\n",
+      "            tmp[el]=result[el]['value']\n",
+      "        data.append(tmp)\n",
+      "\n",
+      "    df = pd.DataFrame(data)\n",
+      "    return df\n",
+      "\n",
+      "def dfResults(endpoint,prefix,q):\n",
+      "    ''' Generate a data frame containing the results of running\n",
+      "        a SPARQL query with a declared prefix over a specified endpoint '''\n",
+      "    return dict2df( runQuery( endpoint, prefix, q ) )\n",
+      "\n",
+      "#NOTE\n",
+      "#http://nbviewer.ipython.org/github/pgroth/notebook/blob/master/sparql-viz-play.ipynb hints at some possibly tidier ways of doing this\n",
+      "#eg through the use of .variables and .bindings from sparql.query() (?without setting the JSON response?)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#Sometimes it's convenient to build a default prefix to handle most queries to the endpoint\n",
+      "prefix='''\n",
+      "'''"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "I always think of constructing SPARQL queries a bit like constructing a dreamcatcher. You have to craft the query in the form of a graph that mathes - and can capture -  fragments of the whole Linked Data graph residing at the SPARQL endpoint you're querying over. The form of the query thus has to correspond to the shape, and internal graphical connectedness, of the answers of the question you want to ask.\n",
+      "\n",
+      "As with many Linked Data platforms, a useful starting point for exploration is often an example query... "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#Prices (ordered) with currency of OU level 1 courses in Arts and Humanities as available in France.\n",
+      "#via http://data.open.ac.uk/site/queries.html\n",
+      "q='''\n",
+      "#We're looking for some course identifiers along with the value and currency of their price\n",
+      "SELECT ?course ?price ?cur\n",
+      "FROM <http://data.open.ac.uk/context/course>\n",
+      "WHERE {\n",
+      "    #The course must be at level 1\n",
+      "    ?course <http://data.open.ac.uk/saou/ontology#OUCourseLevel> \"1\"^^<http://www.w3.org/2001/XMLSchema#string>.\n",
+      "    #The course must be in in Arts and Humanities\n",
+      "    ?course <http://purl.org/dc/terms/subject> <http://data.open.ac.uk/topic/arts_and_humanities>.\n",
+      "    #The course must be part of an offering\n",
+      "    ?off <http://purl.org/goodrelations/v1#includes> ?course.\n",
+      "    #And that offering must have a price\n",
+      "    ?off <http://purl.org/goodrelations/v1#hasPriceSpecification> ?ps.\n",
+      "    #The course must be located(?) in France\n",
+      "    ?course <http://purl.org/net/mlo/location> <http://sws.geonames.org/3017382/>.\n",
+      "    #And the offering must be available in France\n",
+      "    ?off <http://purl.org/goodrelations/v1#availableAtOrFrom> <http://sws.geonames.org/3017382/>.\n",
+      "    #The price must have a value\n",
+      "    ?ps <http://purl.org/goodrelations/v1#hasCurrencyValue> ?price.\n",
+      "    #The price must also be in a particular currency\n",
+      "    ?ps <http://purl.org/goodrelations/v1#hasCurrency> ?cur\n",
+      "} ORDER BY ?price\n",
+      "#The ordering ranks the results in increasing price order\n",
+      "'''"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#We can now run the query over the endpoint with the desired prefix\n",
+      "#and get the results back as a pandas dataframe\n",
+      "dfResults(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "When querying a new graph, we need to find some way of discovering how graphs are structured around particular items. The `DESCRIBE` command can be very useful in this respect.\n",
+      "\n",
+      "For example, we can have a look at the graph associated with the AA100 module."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "q='DESCRIBE <http://data.open.ac.uk/course/aa100>'\n",
+      "ans=runQuery(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "In this case, the response is an encoded text bytestream - we can get a human readable view by decoding it as UTF-8 and printing the result."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#print(ans.decode('utf-8'))\n",
+      "\n",
+      "#The result is rather verbose!"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Another approach to exploring the structure of a graph around a resource is to identify a resource and then pull back the properties associated with it.\n",
+      "\n",
+      "In the follwoing case, we identify a module resource and pull back the properties associated with it."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "prefix='''\n",
+      "PREFIX aiiso: <http://purl.org/vocab/aiiso/schema#>\n",
+      "'''\n",
+      "\n",
+      "q='''\n",
+      "SELECT DISTINCT ?property\n",
+      "WHERE {\n",
+      "  ?module a aiiso:Module .\n",
+      "  ?module ?property ?example\n",
+      "}\n",
+      "'''\n",
+      "\n",
+      "runQuery(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "We can use the description or the list of properties to help us construct a query that only pulls out the fields we are interested in. For example, we can see that we can pull out the title and description of the module:"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "prefix='''\n",
+      "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n",
+      "PREFIX dcterms: <http://purl.org/dc/terms/>\n",
+      "PREFIX aiiso: <http://purl.org/vocab/aiiso/schema#>\n",
+      "'''\n",
+      "\n",
+      "q='''\n",
+      "SELECT ?code ?title ?desc WHERE {\n",
+      "    <http://data.open.ac.uk/course/aa100> aiiso:code ?code.\n",
+      "    <http://data.open.ac.uk/course/aa100> rdfs:label ?title.\n",
+      "    <http://data.open.ac.uk/course/aa100> dcterms:description ?desc.\n",
+      "}\n",
+      "'''\n",
+      "\n",
+      "runQuery(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Let's create a helper function to display the result in a rather more readable way."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def printQuery(endpoint,prefix,q):\n",
+      "    ''' Print the results from the SPARQL query '''\n",
+      "    results=runQuery(endpoint,prefix,q)\n",
+      "    for result in results[\"results\"][\"bindings\"]:\n",
+      "        for ans in result:\n",
+      "            print('{0}: {1}'.format(ans,result[ans]['value']))\n",
+      "        print()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "printQuery(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Repeatedly quoting the resource we are interested in describing feels a bit a redundant and susceptible to error. So instead we might search for a resource that is a module with the desired module code. The module code is used to \"anchor\" the corresponding module resource variable.\n",
+      "\n",
+      "We note from the result of the earlier query, and the `DESCRIBE` query, that the module code is specified as a string type: we need to make that clear in our query."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "prefix='''\n",
+      "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
+      "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n",
+      "PREFIX dcterms: <http://purl.org/dc/terms/>\n",
+      "PREFIX aiiso: <http://purl.org/vocab/aiiso/schema#>\n",
+      "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
+      "'''\n",
+      "\n",
+      "q='''\n",
+      "SELECT ?code ?title ?desc WHERE {\n",
+      "    ?module aiiso:code 'AA100'^^xsd:string.\n",
+      "    ?module rdf:type aiiso:Module.\n",
+      "        \n",
+      "    ?module aiiso:code ?code.\n",
+      "    ?module aiiso:code ?code.\n",
+      "    ?module rdfs:label ?title.\n",
+      "    ?module dcterms:description ?desc.\n",
+      "}\n",
+      "'''\n",
+      "\n",
+      "printQuery(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Rummaging through some of the other properties associeated with a course, we see it may be possible to get a list of books associated with the course via the <http://data.open.ac.uk/saou/ontology#hasBook> property."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "prefix='''\n",
+      "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
+      "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n",
+      "PREFIX dcterms: <http://purl.org/dc/terms/>\n",
+      "PREFIX aiiso: <http://purl.org/vocab/aiiso/schema#>\n",
+      "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
+      "PREFIX saou: <http://data.open.ac.uk/saou/ontology#>\n",
+      "'''\n",
+      "\n",
+      "q='''\n",
+      "SELECT ?code ?book WHERE {\n",
+      "    ?module aiiso:code 'AA100'^^xsd:string.\n",
+      "    ?module rdf:type aiiso:Module.\n",
+      "        \n",
+      "    ?module aiiso:code ?code.\n",
+      "    ?module rdfs:label ?title.\n",
+      "    ?module dcterms:description ?desc.\n",
+      "    ?module saou:hasBook ?book.\n",
+      "}\n",
+      "'''\n",
+      "printQuery(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Now we can start to \"join\" the result with other data elements by extending the graph representation of the answers we want to get back. For example, what might we find out about the books? "
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "q='DESCRIBE <http://data.open.ac.uk/book/9780571195473>'\n",
+      "\n",
+      "def printDesc(endpoint,q):\n",
+      "    ans=runQuery(endpoint,prefix,q)\n",
+      "    print(ans.decode('utf-8'))\n",
+      "    \n",
+      "printDesc(endpoint,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Let's weave elements of that in to our original query..."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "prefix='''\n",
+      "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n",
+      "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n",
+      "PREFIX dcterms: <http://purl.org/dc/terms/>\n",
+      "PREFIX aiiso: <http://purl.org/vocab/aiiso/schema#>\n",
+      "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>\n",
+      "PREFIX saou: <http://data.open.ac.uk/saou/ontology#>\n",
+      "PREFIX bibo: <http://purl.org/ontology/bibo/>\n",
+      "'''\n",
+      "\n",
+      "q='''\n",
+      "SELECT ?booktitle ?isbn13 ?publisher WHERE {\n",
+      "    ?module aiiso:code 'AA100'^^xsd:string.\n",
+      "    ?module rdf:type aiiso:Module.\n",
+      "\n",
+      "    ?module aiiso:code ?code.\n",
+      "    ?module rdfs:label ?title.\n",
+      "    ?module dcterms:description ?desc.\n",
+      "    ?module saou:hasBook ?book.\n",
+      "    \n",
+      "    ?book dcterms:title ?booktitle.\n",
+      "    ?book bibo:ISBN13 ?isbn13.\n",
+      "    ?book dcterms:publisher ?publisher.\n",
+      "}\n",
+      "'''\n",
+      "printQuery(endpoint,prefix,q)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "For other example queries over the OU open Linked Data, see [data.open.ac.uk - Example queries](http://data.open.ac.uk/site/queries.html) (or [this gist](https://gist.github.com/Open-University-LOD)), which includes queries showing how to:\n",
+      "\n",
+      "- find any video podcasts or OpenLearn units whose descriptions contain the term \u201cearthquake\u201d;\n",
+      "- list the OU's most recent YouTube videos, sorted by date (most recent first).\n"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file