X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=write-to-database.ipynb;fp=write-to-database.ipynb;h=96884afeb6272390af65c98a3a143d2e2be5c972;hb=cbc82ab0d7821ee263e1eabf32b1f932810775a2;hp=0000000000000000000000000000000000000000;hpb=66626d16adffb0245d288e94c36638a9eac00055;p=dmarc.git diff --git a/write-to-database.ipynb b/write-to-database.ipynb new file mode 100644 index 0000000..96884af --- /dev/null +++ b/write-to-database.ipynb @@ -0,0 +1,1513 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import configparser\n", + "import xml.etree.ElementTree\n", + "import psycopg2\n", + "import re\n", + "import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['dmarc.ini']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config = configparser.ConfigParser()\n", + "config.read('dmarc.ini')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['imap', 'database']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config.sections()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['server', 'username', 'port', 'password']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[k for k in config['imap']]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['server', 'database', 'username', 'password']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[k for k in config['database']]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def walk(node, indent=0):\n", + " for child in node:\n", + " if child.text:\n", + " txt = child.text.strip()\n", + " else:\n", + " txt = ''\n", + " print(' '*indent, child.tag, ':', txt)\n", + " walk(child, indent+2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "google.com!njae.me.uk!1458777600!1458863999.xml hotmail.com!njae.me.uk!1459011600!1459098000.xml yahoo.com!njae.me.uk!1459036800!1459123199.xml\r\n", + "google.com!njae.me.uk!1458950400!1459036799.xml yahoo.com!njae.me.uk!1458864000!1458950399.xml\r\n" + ] + } + ], + "source": [ + "!ls *xml" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['google.com!njae.me.uk!1458777600!1458863999.xml',\n", + " 'google.com!njae.me.uk!1458950400!1459036799.xml',\n", + " 'hotmail.com!njae.me.uk!1459011600!1459098000.xml',\n", + " 'yahoo.com!njae.me.uk!1458864000!1458950399.xml',\n", + " 'yahoo.com!njae.me.uk!1459036800!1459123199.xml']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xmls = !ls *xml\n", + "xmls" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'google.com!njae.me.uk!1458777600!1458863999.xml': ,\n", + " 'google.com!njae.me.uk!1458950400!1459036799.xml': ,\n", + " 'hotmail.com!njae.me.uk!1459011600!1459098000.xml': ,\n", + " 'yahoo.com!njae.me.uk!1458864000!1458950399.xml': ,\n", + " 'yahoo.com!njae.me.uk!1459036800!1459123199.xml': }" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dmarc_reports = {f: xml.etree.ElementTree.fromstring(open(f).read()) for f in xmls}\n", + "dmarc_reports" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " report_metadata : \n", + " org_name : google.com\n", + " email : noreply-dmarc-support@google.com\n", + " extra_contact_info : https://support.google.com/a/answer/2466580\n", + " report_id : 2150510829392606201\n", + " date_range : \n", + " begin : 1458950400\n", + " end : 1459036799\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " sp : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 82.109.184.9\n", + " count : 1\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : fail\n", + " spf : fail\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : clublloyds.com\n", + " result : pass\n", + " spf : \n", + " domain : clublloyds.com\n", + " result : pass\n", + " record : \n", + " row : \n", + " source_ip : 212.69.55.62\n", + " count : 1\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : pass\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " spf : \n", + " domain : njae.me.uk\n", + " result : pass\n", + "\n", + " report_metadata : \n", + " org_name : Yahoo! Inc.\n", + " email : postmaster@dmarc.yahoo.com\n", + " report_id : 1459129809.695034\n", + " date_range : \n", + " begin : 1459036800\n", + " end : 1459123199\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 65.20.0.12\n", + " count : 1\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : fail\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " spf : \n", + " domain : njae.me.uk\n", + " result : softfail\n", + "\n", + " report_metadata : \n", + " org_name : Yahoo! Inc.\n", + " email : postmaster@dmarc.yahoo.com\n", + " report_id : 1458957186.548175\n", + " date_range : \n", + " begin : 1458864000\n", + " end : 1458950399\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 212.69.55.62\n", + " count : 2\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : pass\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " spf : \n", + " domain : njae.me.uk\n", + " result : pass\n", + "\n", + " report_metadata : \n", + " org_name : Microsoft Corp.\n", + " email : dmarcrep@microsoft.com\n", + " report_id : 68aad5080a774e2c997d159b546569b9@hotmail.com\n", + " date_range : \n", + " begin : 1459011600\n", + " end : 1459098000\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " sp : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 212.69.55.62\n", + " count : 2\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : pass\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " spf : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n", + "\n", + " report_metadata : \n", + " org_name : google.com\n", + " email : noreply-dmarc-support@google.com\n", + " extra_contact_info : https://support.google.com/a/answer/2466580\n", + " report_id : 13032699446207263761\n", + " date_range : \n", + " begin : 1458777600\n", + " end : 1458863999\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " sp : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 146.185.136.235\n", + " count : 1\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : fail\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " spf : \n", + " domain : njae.me.uk\n", + " result : softfail\n", + "\n" + ] + } + ], + "source": [ + "for f in dmarc_reports:\n", + " walk(dmarc_reports[f])\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " report_metadata : \n", + " org_name : Yahoo! Inc.\n", + " email : postmaster@dmarc.yahoo.com\n", + " report_id : 1459129809.695034\n", + " date_range : \n", + " begin : 1459036800\n", + " end : 1459123199\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 65.20.0.12\n", + " count : 1\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : fail\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " spf : \n", + " domain : njae.me.uk\n", + " result : softfail\n" + ] + } + ], + "source": [ + "root2 = xml.etree.ElementTree.fromstring(open('yahoo.com!njae.me.uk!1459036800!1459123199.xml').read())\n", + "walk(root2)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "conn = psycopg2.connect(host=config['database']['server'],\n", + " database=config['database']['database'], \n", + " user=config['database']['username'], \n", + " password=config['database']['password'])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur = conn.cursor()\n", + "cur" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'email': 'varchar', 'org_name': 'varchar', 'report_id': 'varchar'}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "report_fields = {'date_range_begin': 'timestamp', 'date_range_end': 'timestamp'}\n", + "for child in root2:\n", + " if child.tag == 'report_metadata':\n", + " for c in child:\n", + " if c.tag != 'date_range':\n", + " report_fields[c.tag] = 'varchar'\n", + "report_fields" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def xpath_of(node, prefix=''):\n", + " if len(node) == 0:\n", + " if node.text:\n", + " nodes = {prefix + node.tag: node.text.strip()}\n", + " else:\n", + " nodes = {prefix + node.tag: ''}\n", + " else:\n", + " nodes = {}\n", + " if node:\n", + " for child in node:\n", + " nodes.update(xpath_of(child, \n", + " prefix + node.tag + '/'))\n", + " return nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'feedback/policy_published/adkim': 'r',\n", + " 'feedback/policy_published/aspf': 'r',\n", + " 'feedback/policy_published/domain': 'njae.me.uk',\n", + " 'feedback/policy_published/p': 'none',\n", + " 'feedback/policy_published/pct': '100',\n", + " 'feedback/record/auth_results/dkim/domain': 'njae.me.uk',\n", + " 'feedback/record/auth_results/dkim/result': 'pass',\n", + " 'feedback/record/auth_results/spf/domain': 'njae.me.uk',\n", + " 'feedback/record/auth_results/spf/result': 'softfail',\n", + " 'feedback/record/identifiers/header_from': 'njae.me.uk',\n", + " 'feedback/record/row/count': '1',\n", + " 'feedback/record/row/policy_evaluated/disposition': 'none',\n", + " 'feedback/record/row/policy_evaluated/dkim': 'pass',\n", + " 'feedback/record/row/policy_evaluated/spf': 'fail',\n", + " 'feedback/record/row/source_ip': '65.20.0.12',\n", + " 'feedback/report_metadata/date_range/begin': '1459036800',\n", + " 'feedback/report_metadata/date_range/end': '1459123199',\n", + " 'feedback/report_metadata/email': 'postmaster@dmarc.yahoo.com',\n", + " 'feedback/report_metadata/org_name': 'Yahoo! Inc.',\n", + " 'feedback/report_metadata/report_id': '1459129809.695034'}" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xpath_of(root2)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'1459036800'" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "root2.find('./report_metadata/date_range/begin').text" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'./report_metadata/date_range/begin'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re.sub(r'^\\w*(/.*)$', r'.\\1', 'feedback/report_metadata/date_range/begin')" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'report_metadata_date_range_begin'" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re.sub(r'/', r'_',\n", + " re.sub(r'^[^\\/]*/(.*)$', r'\\1', 'feedback/report_metadata/date_range/begin'))" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'./policy_published/adkim': {'pg_field_name': 'policy_published_adkim',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'},\n", + " './policy_published/aspf': {'pg_field_name': 'policy_published_aspf',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'},\n", + " './policy_published/domain': {'pg_field_name': 'policy_published_domain',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'},\n", + " './policy_published/p': {'pg_field_name': 'policy_published_p',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'},\n", + " './policy_published/pct': {'pg_field_name': 'policy_published_pct',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'int'},\n", + " './record[{}]/auth_results/dkim/domain': {'pg_field_name': 'auth_results_dkim_domain',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/auth_results/dkim/result': {'pg_field_name': 'auth_results_dkim_result',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/auth_results/spf/domain': {'pg_field_name': 'auth_results_spf_domain',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/auth_results/spf/result': {'pg_field_name': 'auth_results_spf_result',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/identifiers/header_from': {'pg_field_name': 'identifiers_header_from',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/row/count': {'pg_field_name': 'count',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'int'},\n", + " './record[{}]/row/policy_evaluated/disposition': {'pg_field_name': 'policy_evaluated_disposition',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/row/policy_evaluated/dkim': {'pg_field_name': 'policy_evaluated_dkim',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/row/policy_evaluated/spf': {'pg_field_name': 'policy_evaluated_spf',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'},\n", + " './record[{}]/row/source_ip': {'pg_field_name': 'source_ip',\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'inet'},\n", + " './report_metadata/date_range/begin': {'pg_field_name': 'report_metadata_date_range_begin',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'timestamp'},\n", + " './report_metadata/date_range/end': {'pg_field_name': 'report_metadata_date_range_end',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'timestamp'},\n", + " './report_metadata/email': {'pg_field_name': 'report_metadata_email',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'},\n", + " './report_metadata/org_name': {'pg_field_name': 'report_metadata_org_name',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'},\n", + " './report_metadata/report_id': {'pg_field_name': 'report_metadata_report_id',\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'}}" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "field_maps = {re.sub(r'^\\w*(/.*)$', r'.\\1', name): \n", + " {'pg_field_name': re.sub(r'/', r'_', re.sub(r'^[^\\/]*/(.*)$', r'\\1', name)),\n", + " 'pg_table': 'reports',\n", + " 'pg_type': 'varchar'}\n", + " for name in xpath_of(root2)\n", + " if 'record' not in name}\n", + "field_maps.update({re.sub(r'^\\w*/record(/.*)$', r'./record[{}]\\1', name): \n", + " {'pg_field_name': re.sub(r'/', r'_', re.sub(r'^.*/record(/row)?/(.*)$', r'\\2', name)),\n", + " 'pg_table': 'report_items',\n", + " 'pg_type': 'varchar'}\n", + " for name in xpath_of(root2)\n", + " if 'record' in name})\n", + "field_maps['./report_metadata/date_range/begin']['pg_type'] = 'timestamp'\n", + "field_maps['./report_metadata/date_range/end']['pg_type'] = 'timestamp'\n", + "field_maps['./policy_published/pct']['pg_type'] = 'int'\n", + "field_maps['./record[{}]/row/count']['pg_type'] = 'int'\n", + "field_maps['./record[{}]/row/source_ip']['pg_type'] = 'inet'\n", + "\n", + "field_maps" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.datetime(2016, 3, 27, 23, 59, 59)" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datetime.datetime.utcfromtimestamp(1459123199)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1459119599.0" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "datetime.datetime.utcfromtimestamp(1459123199).timestamp()" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'create table reports (id serial primary key, policy_published_p varchar, report_metadata_date_range_end timestamp, report_metadata_org_name varchar, policy_published_aspf varchar, policy_published_adkim varchar, policy_published_pct int, report_metadata_email varchar, report_metadata_date_range_begin timestamp, report_metadata_report_id varchar, policy_published_domain varchar);'" + ] + }, + "execution_count": 157, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "create_report_table_string = 'create table reports (id serial primary key, ' + \\\n", + "', '.join(field_maps[p]['pg_field_name'] + ' ' + field_maps[p]['pg_type'] \n", + " for p in field_maps if field_maps[p]['pg_table'] == 'reports') + \\\n", + "');'\n", + "create_report_table_string" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'create table report_items (id serial primary key, report_id integer references reports, auth_results_spf_domain varchar, auth_results_dkim_domain varchar, policy_evaluated_dkim varchar, auth_results_spf_result varchar, source_ip inet, policy_evaluated_spf varchar, count int, identifiers_header_from varchar, policy_evaluated_disposition varchar, auth_results_dkim_result varchar);'" + ] + }, + "execution_count": 193, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "create_report_item_table_string = 'create table report_items (id serial primary key, ' + \\\n", + " 'report_id integer references reports, ' + \\\n", + "', '.join(field_maps[p]['pg_field_name'] + ' ' + field_maps[p]['pg_type'] \n", + " for p in field_maps if field_maps[p]['pg_table'] == 'report_items') + \\\n", + "');'\n", + "create_report_item_table_string" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cur.execute(create_report_table_string)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cur.execute(create_report_item_table_string)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rep = dmarc_reports['google.com!njae.me.uk!1458777600!1458863999.xml']" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'1'" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rep.find('./record/row/count').text" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[('google.com!njae.me.uk!1458950400!1459036799.xml', '1'),\n", + " ('yahoo.com!njae.me.uk!1459036800!1459123199.xml', '1'),\n", + " ('yahoo.com!njae.me.uk!1458864000!1458950399.xml', '2'),\n", + " ('hotmail.com!njae.me.uk!1459011600!1459098000.xml', '2'),\n", + " ('google.com!njae.me.uk!1458777600!1458863999.xml', '1')]" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(r, dmarc_reports[r].find('./record/row/count').text) for r in dmarc_reports]" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " report_metadata : \n", + " org_name : Microsoft Corp.\n", + " email : dmarcrep@microsoft.com\n", + " report_id : 68aad5080a774e2c997d159b546569b9@hotmail.com\n", + " date_range : \n", + " begin : 1459011600\n", + " end : 1459098000\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " sp : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 212.69.55.62\n", + " count : 2\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : pass\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " spf : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n" + ] + } + ], + "source": [ + "walk(dmarc_reports['hotmail.com!njae.me.uk!1459011600!1459098000.xml'])" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " report_metadata : \n", + " org_name : google.com\n", + " email : noreply-dmarc-support@google.com\n", + " extra_contact_info : https://support.google.com/a/answer/2466580\n", + " report_id : 2150510829392606201\n", + " date_range : \n", + " begin : 1458950400\n", + " end : 1459036799\n", + " policy_published : \n", + " domain : njae.me.uk\n", + " adkim : r\n", + " aspf : r\n", + " p : none\n", + " sp : none\n", + " pct : 100\n", + " record : \n", + " row : \n", + " source_ip : 82.109.184.9\n", + " count : 1\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : fail\n", + " spf : fail\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : clublloyds.com\n", + " result : pass\n", + " spf : \n", + " domain : clublloyds.com\n", + " result : pass\n", + " record : \n", + " row : \n", + " source_ip : 212.69.55.62\n", + " count : 1\n", + " policy_evaluated : \n", + " disposition : none\n", + " dkim : pass\n", + " spf : pass\n", + " identifiers : \n", + " header_from : njae.me.uk\n", + " auth_results : \n", + " dkim : \n", + " domain : njae.me.uk\n", + " result : pass\n", + " spf : \n", + " domain : njae.me.uk\n", + " result : pass\n" + ] + } + ], + "source": [ + "walk(dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml'])" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['./record[{}]/auth_results/spf/domain',\n", + " './record[{}]/auth_results/dkim/domain',\n", + " './record[{}]/row/policy_evaluated/dkim',\n", + " './record[{}]/auth_results/spf/result',\n", + " './record[{}]/row/source_ip',\n", + " './record[{}]/row/policy_evaluated/spf',\n", + " './record[{}]/row/count',\n", + " './record[{}]/identifiers/header_from',\n", + " './record[{}]/row/policy_evaluated/disposition',\n", + " './record[{}]/auth_results/dkim/result']" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[p for p in field_maps if field_maps[p]['pg_table'] == 'report_items']" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml'].findall('./record'))" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'212.69.55.62'" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml'].find('./record[2]/row/source_ip').text" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'82.109.184.9'" + ] + }, + "execution_count": 153, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml'].find('./record[{}]/row/source_ip'.format(1)).text" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "policy_published_p :> none\n", + "report_metadata_date_range_end :> 1459036799\n", + "report_metadata_org_name :> google.com\n", + "policy_published_aspf :> r\n", + "policy_published_adkim :> r\n", + "policy_published_pct :> 100\n", + "report_metadata_email :> noreply-dmarc-support@google.com\n", + "report_metadata_date_range_begin :> 1458950400\n", + "report_metadata_report_id :> 2150510829392606201\n", + "policy_published_domain :> njae.me.uk\n" + ] + } + ], + "source": [ + "rep = dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml']\n", + "for f in [f for f in field_maps if field_maps[f]['pg_table'] == 'reports']:\n", + " print(field_maps[f]['pg_field_name'], ':>', rep.find(f).text)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "auth_results_spf_domain :> clublloyds.com\n", + "auth_results_dkim_domain :> clublloyds.com\n", + "policy_evaluated_dkim :> fail\n", + "auth_results_spf_result :> pass\n", + "source_ip :> 82.109.184.9\n", + "policy_evaluated_spf :> fail\n", + "count :> 1\n", + "identifiers_header_from :> njae.me.uk\n", + "policy_evaluated_disposition :> none\n", + "auth_results_dkim_result :> pass\n", + "2\n", + "auth_results_spf_domain :> njae.me.uk\n", + "auth_results_dkim_domain :> njae.me.uk\n", + "policy_evaluated_dkim :> pass\n", + "auth_results_spf_result :> pass\n", + "source_ip :> 212.69.55.62\n", + "policy_evaluated_spf :> pass\n", + "count :> 1\n", + "identifiers_header_from :> njae.me.uk\n", + "policy_evaluated_disposition :> none\n", + "auth_results_dkim_result :> pass\n" + ] + } + ], + "source": [ + "rep = dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml']\n", + "for i in range(1, len(rep.findall('./record'))+1):\n", + " print(i)\n", + " for f in [f for f in field_maps if field_maps[f]['pg_table'] == 'report_items']:\n", + " print(field_maps[f]['pg_field_name'], ':>', rep.find(f.format(i)).text)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def maybe_strip(text):\n", + " if text:\n", + " return text.strip()\n", + " else:\n", + " return ''" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('insert into reports (policy_published_p, report_metadata_date_range_end, report_metadata_org_name, policy_published_aspf, policy_published_adkim, policy_published_pct, report_metadata_email, report_metadata_date_range_begin, report_metadata_report_id, policy_published_domain) values (%(policy_published_p)s, %(report_metadata_date_range_end)s, %(report_metadata_org_name)s, %(policy_published_aspf)s, %(policy_published_adkim)s, %(policy_published_pct)s, %(report_metadata_email)s, %(report_metadata_date_range_begin)s, %(report_metadata_report_id)s, %(policy_published_domain)s);',\n", + " {'policy_published_adkim': 'r',\n", + " 'policy_published_aspf': 'r',\n", + " 'policy_published_domain': 'njae.me.uk',\n", + " 'policy_published_p': 'none',\n", + " 'policy_published_pct': 100,\n", + " 'report_metadata_date_range_begin': datetime.datetime(2016, 3, 26, 0, 0),\n", + " 'report_metadata_date_range_end': datetime.datetime(2016, 3, 26, 23, 59, 59),\n", + " 'report_metadata_email': 'noreply-dmarc-support@google.com',\n", + " 'report_metadata_org_name': 'google.com',\n", + " 'report_metadata_report_id': '2150510829392606201'})" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rep = dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml']\n", + "field_names = []\n", + "values = {}\n", + "for f in [f for f in field_maps if field_maps[f]['pg_table'] == 'reports']:\n", + " field_names += [field_maps[f]['pg_field_name']]\n", + " if field_maps[f]['pg_type'] == 'int':\n", + " values[field_maps[f]['pg_field_name']] = int(rep.find(f).text)\n", + " elif field_maps[f]['pg_type'] == 'timestamp':\n", + " values[field_maps[f]['pg_field_name']] = datetime.datetime.utcfromtimestamp(int(rep.find(f).text))\n", + " elif field_maps[f]['pg_type'] == 'inet':\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(rep.find(f).text)\n", + " else:\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(rep.find(f).text)\n", + "insert_string = 'insert into reports (' + ', '.join(field_names) +') '\n", + "insert_string += 'values (' + ', '.join('%({})s'.format(f) for f in field_names) + ');'\n", + "insert_string, values" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cur.execute(insert_string, values)\n", + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cur.execute('select id, report_metadata_report_id from reports where report_metadata_report_id = %s;', \n", + " [rep.find('./report_metadata/report_id').text])\n", + "cur.fetchall()[0][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "insert into report_items (report_id, auth_results_spf_domain, auth_results_dkim_domain, policy_evaluated_dkim, auth_results_spf_result, source_ip, policy_evaluated_spf, count, identifiers_header_from, policy_evaluated_disposition, auth_results_dkim_result) values (%(report_id)s, %(auth_results_spf_domain)s, %(auth_results_dkim_domain)s, %(policy_evaluated_dkim)s, %(auth_results_spf_result)s, %(source_ip)s, %(policy_evaluated_spf)s, %(count)s, %(identifiers_header_from)s, %(policy_evaluated_disposition)s, %(auth_results_dkim_result)s); {'auth_results_spf_result': 'pass', 'count': 1, 'report_id': 1, 'identifiers_header_from': 'njae.me.uk', 'policy_evaluated_disposition': 'none', 'auth_results_dkim_result': 'pass', 'source_ip': '82.109.184.9', 'policy_evaluated_spf': 'fail', 'auth_results_dkim_domain': 'clublloyds.com', 'auth_results_spf_domain': 'clublloyds.com', 'policy_evaluated_dkim': 'fail'}\n", + "2\n", + "insert into report_items (report_id, auth_results_spf_domain, auth_results_dkim_domain, policy_evaluated_dkim, auth_results_spf_result, source_ip, policy_evaluated_spf, count, identifiers_header_from, policy_evaluated_disposition, auth_results_dkim_result) values (%(report_id)s, %(auth_results_spf_domain)s, %(auth_results_dkim_domain)s, %(policy_evaluated_dkim)s, %(auth_results_spf_result)s, %(source_ip)s, %(policy_evaluated_spf)s, %(count)s, %(identifiers_header_from)s, %(policy_evaluated_disposition)s, %(auth_results_dkim_result)s); {'auth_results_spf_result': 'pass', 'count': 1, 'report_id': 1, 'identifiers_header_from': 'njae.me.uk', 'policy_evaluated_disposition': 'none', 'auth_results_dkim_result': 'pass', 'source_ip': '212.69.55.62', 'policy_evaluated_spf': 'pass', 'auth_results_dkim_domain': 'njae.me.uk', 'auth_results_spf_domain': 'njae.me.uk', 'policy_evaluated_dkim': 'pass'}\n" + ] + } + ], + "source": [ + "rep = dmarc_reports['google.com!njae.me.uk!1458950400!1459036799.xml']\n", + "for i in range(1, len(rep.findall('./record'))+1):\n", + " print(i)\n", + " field_names = []\n", + " cur.execute('select id, report_metadata_report_id from reports where report_metadata_report_id = %s;', \n", + " [rep.find('./report_metadata/report_id').text])\n", + " results = cur.fetchall()\n", + " if len(results) != 1:\n", + " raise\n", + " else:\n", + " report_id = results[0][0]\n", + " values = {'report_id': report_id}\n", + " for f in [f for f in field_maps if field_maps[f]['pg_table'] == 'report_items']:\n", + " field_names += [field_maps[f]['pg_field_name']]\n", + " if field_maps[f]['pg_type'] == 'int':\n", + " values[field_maps[f]['pg_field_name']] = int(rep.find(f.format(i)).text)\n", + " elif field_maps[f]['pg_type'] == 'timestamp':\n", + " values[field_maps[f]['pg_field_name']] = datetime.datetime.utcfromtimestamp(int(rep.find(f.format(i)).text))\n", + " elif field_maps[f]['pg_type'] == 'inet':\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(rep.find(f.format(i)).text)\n", + " else:\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(rep.find(f.format(i)).text)\n", + " insert_string = 'insert into report_items (report_id, ' + ', '.join(field_names) +') '\n", + " insert_string += 'values (%(report_id)s, ' + ', '.join('%({})s'.format(f) for f in field_names) + ');'\n", + " print(insert_string, values)\n", + " cur.execute(insert_string, values)\n", + "conn.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "conn.rollback()" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def write_report(connection, cursor, report):\n", + " \n", + " field_names = []\n", + " values = {}\n", + " for f in [f for f in field_maps if field_maps[f]['pg_table'] == 'reports']:\n", + " field_names += [field_maps[f]['pg_field_name']]\n", + " if field_maps[f]['pg_type'] == 'int':\n", + " values[field_maps[f]['pg_field_name']] = int(report.find(f).text)\n", + " elif field_maps[f]['pg_type'] == 'timestamp':\n", + " values[field_maps[f]['pg_field_name']] = datetime.datetime.utcfromtimestamp(int(report.find(f).text))\n", + " elif field_maps[f]['pg_type'] == 'inet':\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(report.find(f).text)\n", + " else:\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(report.find(f).text)\n", + " insert_string = 'insert into reports (' + ', '.join(field_names) + ') '\n", + " insert_string += 'values (' + ', '.join('%({})s'.format(f) for f in field_names) + ');'\n", + " cursor.execute(insert_string, values)\n", + " \n", + " for i in range(1, len(report.findall('./record'))+1):\n", + " field_names = []\n", + " cursor.execute('select id, report_metadata_report_id from reports where report_metadata_report_id = %s;', \n", + " [report.find('./report_metadata/report_id').text])\n", + " results = cursor.fetchall()\n", + " if len(results) != 1:\n", + " raise\n", + " else:\n", + " report_id = results[0][0]\n", + " values = {'report_id': report_id}\n", + " for f in [f for f in field_maps if field_maps[f]['pg_table'] == 'report_items']:\n", + " field_names += [field_maps[f]['pg_field_name']]\n", + " if field_maps[f]['pg_type'] == 'int':\n", + " values[field_maps[f]['pg_field_name']] = int(report.find(f.format(i)).text)\n", + " elif field_maps[f]['pg_type'] == 'timestamp':\n", + " values[field_maps[f]['pg_field_name']] = datetime.datetime.utcfromtimestamp(int(report.find(f.format(i)).text))\n", + " elif field_maps[f]['pg_type'] == 'inet':\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(report.find(f.format(i)).text)\n", + " else:\n", + " values[field_maps[f]['pg_field_name']] = maybe_strip(report.find(f.format(i)).text)\n", + " insert_string = 'insert into report_items (report_id, ' + ', '.join(field_names) + ') '\n", + " insert_string += 'values (%(report_id)s, ' + ', '.join('%({})s'.format(f) for f in field_names) + ');'\n", + " cursor.execute(insert_string, values)\n", + " connection.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for rep in dmarc_reports:\n", + " write_report(conn, cur, dmarc_reports[rep])" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "conn.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "conn.rollback()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3+" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}