Updated for challenge 9
[cipher-tools.git] / 2018 / make-history-words.ipynb
1 {
2 "cells": [
3 {
4 "cell_type": "code",
5 "execution_count": 4,
6 "metadata": {},
7 "outputs": [],
8 "source": [
9 "import os,sys,inspect\n",
10 "currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))\n",
11 "parentdir = os.path.dirname(currentdir)\n",
12 "sys.path.insert(0,parentdir) "
13 ]
14 },
15 {
16 "cell_type": "code",
17 "execution_count": 5,
18 "metadata": {},
19 "outputs": [],
20 "source": [
21 "import string\n",
22 "from support.utilities import *"
23 ]
24 },
25 {
26 "cell_type": "code",
27 "execution_count": 6,
28 "metadata": {},
29 "outputs": [],
30 "source": [
31 "text = open('history-words-raw.txt').read().lower()"
32 ]
33 },
34 {
35 "cell_type": "code",
36 "execution_count": 7,
37 "metadata": {},
38 "outputs": [
39 {
40 "data": {
41 "text/plain": [
42 "437770"
43 ]
44 },
45 "execution_count": 7,
46 "metadata": {},
47 "output_type": "execute_result"
48 }
49 ],
50 "source": [
51 "len(text)"
52 ]
53 },
54 {
55 "cell_type": "code",
56 "execution_count": 14,
57 "metadata": {},
58 "outputs": [],
59 "source": [
60 "cleaned = cat(c if c in string.ascii_letters else ' ' for c in unaccent(text))"
61 ]
62 },
63 {
64 "cell_type": "code",
65 "execution_count": 15,
66 "metadata": {},
67 "outputs": [
68 {
69 "data": {
70 "text/plain": [
71 "' notoc noinclude europeanhistorytoc noinclude border id toc style margin auto'"
72 ]
73 },
74 "execution_count": 15,
75 "metadata": {},
76 "output_type": "execute_result"
77 }
78 ],
79 "source": [
80 "cleaned[:100]"
81 ]
82 },
83 {
84 "cell_type": "code",
85 "execution_count": 16,
86 "metadata": {},
87 "outputs": [
88 {
89 "data": {
90 "text/plain": [
91 "8197"
92 ]
93 },
94 "execution_count": 16,
95 "metadata": {},
96 "output_type": "execute_result"
97 }
98 ],
99 "source": [
100 "cleaned_words = set(cleaned.split())\n",
101 "len(cleaned_words)"
102 ]
103 },
104 {
105 "cell_type": "code",
106 "execution_count": 17,
107 "metadata": {},
108 "outputs": [
109 {
110 "data": {
111 "text/plain": [
112 "67866"
113 ]
114 },
115 "execution_count": 17,
116 "metadata": {},
117 "output_type": "execute_result"
118 }
119 ],
120 "source": [
121 "open('history-words.txt', 'w').write(lcat(cleaned_words))"
122 ]
123 },
124 {
125 "cell_type": "code",
126 "execution_count": 18,
127 "metadata": {},
128 "outputs": [
129 {
130 "data": {
131 "text/plain": [
132 "True"
133 ]
134 },
135 "execution_count": 18,
136 "metadata": {},
137 "output_type": "execute_result"
138 }
139 ],
140 "source": [
141 "'ottoman' in cleaned_words"
142 ]
143 },
144 {
145 "cell_type": "code",
146 "execution_count": null,
147 "metadata": {},
148 "outputs": [],
149 "source": []
150 }
151 ],
152 "metadata": {
153 "kernelspec": {
154 "display_name": "Python 3",
155 "language": "python",
156 "name": "python3"
157 },
158 "language_info": {
159 "codemirror_mode": {
160 "name": "ipython",
161 "version": 3
162 },
163 "file_extension": ".py",
164 "mimetype": "text/x-python",
165 "name": "python",
166 "nbconvert_exporter": "python",
167 "pygments_lexer": "ipython3",
168 "version": "3.6.6"
169 }
170 },
171 "nbformat": 4,
172 "nbformat_minor": 2
173 }