2018/make-history-words.ipynb

   1 {
   2  "cells": [
   3   {
   4    "cell_type": "code",
   5    "execution_count": 4,
   6    "metadata": {},
   7    "outputs": [],
   8    "source": [
   9     "import os,sys,inspect\n",
  10     "currentdir = os.getcwd()  # kernel's working directory; cell code has no real source file for inspect.getfile to locate\n",
  11     "parentdir = os.path.dirname(currentdir)\n",
  12     "sys.path.insert(0, parentdir)  # make modules in the parent directory importable"
  13    ]
  14   },
  15   {
  16    "cell_type": "code",
  17    "execution_count": 5,
  18    "metadata": {},
  19    "outputs": [],
  20    "source": [
  21     "import string\n",
  22     "from support.utilities import cat, lcat, unaccent  # explicit names instead of `*`, so their origin is visible"
  23    ]
  24   },
  25   {
  26    "cell_type": "code",
  27    "execution_count": 6,
  28    "metadata": {},
  29    "outputs": [],
  30    "source": [
  31     "with open('history-words-raw.txt') as raw_file:  # context manager closes the handle deterministically\n",
         "    text = raw_file.read().lower()  # lower-cased up front so the word set is case-insensitive"
  32    ]
  33   },
  34   {
  35    "cell_type": "code",
  36    "execution_count": 7,
  37    "metadata": {},
  38    "outputs": [
  39     {
  40      "data": {
  41       "text/plain": [
  42        "437770"
  43       ]
  44      },
  45      "execution_count": 7,
  46      "metadata": {},
  47      "output_type": "execute_result"
  48     }
  49    ],
  50    "source": [
  51     "len(text)"
  52    ]
  53   },
  54   {
  55    "cell_type": "code",
  56    "execution_count": 14,
  57    "metadata": {},
  58    "outputs": [],
  59    "source": [
  60     "cleaned = cat((ch if ch in string.ascii_letters else ' ') for ch in unaccent(text))  # every non-letter becomes a space"
  61    ]
  62   },
  63   {
  64    "cell_type": "code",
  65    "execution_count": 15,
  66    "metadata": {},
  67    "outputs": [
  68     {
  69      "data": {
  70       "text/plain": [
  71        "'  notoc   noinclude   europeanhistorytoc    noinclude      border     id  toc  style  margin    auto'"
  72       ]
  73      },
  74      "execution_count": 15,
  75      "metadata": {},
  76      "output_type": "execute_result"
  77     }
  78    ],
  79    "source": [
  80     "cleaned[:100]"
  81    ]
  82   },
  83   {
  84    "cell_type": "code",
  85    "execution_count": 16,
  86    "metadata": {},
  87    "outputs": [
  88     {
  89      "data": {
  90       "text/plain": [
  91        "8197"
  92       ]
  93      },
  94      "execution_count": 16,
  95      "metadata": {},
  96      "output_type": "execute_result"
  97     }
  98    ],
  99    "source": [
 100     "cleaned_words = {*cleaned.split()}  # whitespace-separated tokens, de-duplicated\n",
 101     "len(cleaned_words)"
 102    ]
 103   },
 104   {
 105    "cell_type": "code",
 106    "execution_count": 17,
 107    "metadata": {},
 108    "outputs": [
 109     {
 110      "data": {
 111       "text/plain": [
 112        "67866"
 113       ]
 114      },
 115      "execution_count": 17,
 116      "metadata": {},
 117      "output_type": "execute_result"
 118     }
 119    ],
 120    "source": [
 121     "with open('history-words.txt', 'w') as word_file:  # context manager flushes and closes the file\n",
         "    chars_written = word_file.write(lcat(sorted(cleaned_words)))  # sorted: set order differs between runs\n",
         "chars_written"
 122    ]
 123   },
 124   {
 125    "cell_type": "code",
 126    "execution_count": 18,
 127    "metadata": {},
 128    "outputs": [
 129     {
 130      "data": {
 131       "text/plain": [
 132        "True"
 133       ]
 134      },
 135      "execution_count": 18,
 136      "metadata": {},
 137      "output_type": "execute_result"
 138     }
 139    ],
 140    "source": [
 141     "'ottoman' in cleaned_words"
 142    ]
 143   },
 144   {
 145    "cell_type": "code",
 146    "execution_count": null,
 147    "metadata": {},
 148    "outputs": [],
 149    "source": []
 150   }
 151  ],
 152  "metadata": {
 153   "kernelspec": {
 154    "display_name": "Python 3",
 155    "language": "python",
 156    "name": "python3"
 157   },
 158   "language_info": {
 159    "codemirror_mode": {
 160     "name": "ipython",
 161     "version": 3
 162    },
 163    "file_extension": ".py",
 164    "mimetype": "text/x-python",
 165    "name": "python",
 166    "nbconvert_exporter": "python",
 167    "pygments_lexer": "ipython3",
 168    "version": "3.6.6"
 169   }
 170  },
 171  "nbformat": 4,
 172  "nbformat_minor": 2
 173 }