765e22c746ba1a95ec64c20387784f47e05b76eb
[szyfrow.git] / docs / szyfrow / support / language_models.html
1 <!doctype html>
2 <html lang="en">
3 <head>
4 <meta charset="utf-8">
5 <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
6 <meta name="generator" content="pdoc 0.9.2" />
7 <title>szyfrow.support.language_models API documentation</title>
8 <meta name="description" content="Descriptive models of a natural language (in this case, English) …" />
9 <link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
10 <link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
11 <link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
12 <style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
13 <style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
14 <style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
15 <script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
16 <script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
17 </head>
18 <body>
19 <main>
20 <article id="content">
21 <header>
22 <h1 class="title">Module <code>szyfrow.support.language_models</code></h1>
23 </header>
24 <section id="section-intro">
25 <p>Descriptive models of a natural language (in this case, English).</p>
26 <p>The functions <code><a title="szyfrow.support.language_models.Pwords" href="#szyfrow.support.language_models.Pwords">Pwords()</a></code>, <code><a title="szyfrow.support.language_models.Pletters" href="#szyfrow.support.language_models.Pletters">Pletters()</a></code>, <code><a title="szyfrow.support.language_models.Pbigrams" href="#szyfrow.support.language_models.Pbigrams">Pbigrams()</a></code>, and <code><a title="szyfrow.support.language_models.Ptrigrams" href="#szyfrow.support.language_models.Ptrigrams">Ptrigrams()</a></code> return the
27 log probability of a section of text.</p>
28 <p>If you want to use a different language, replace the data files in
29 <a href="../language_model_files/index.html"><code>szyfrow/language_model_files</code></a>.</p>
30 <ul>
31 <li><code>count_1l.txt</code>: counts of single letters</li>
32 <li><code>count_2l.txt</code>: counts of pairs of letters, bigrams</li>
33 <li><code>count_3l.txt</code>: counts of triples of letters, trigrams</li>
34 <li><code>words.txt</code>: a dictionary of words, used for keyword-based cipher breaking.
35 These words should only contain characters contained in
36 <code>string.ascii_letters</code>.</li>
37 </ul>
38 <details class="source">
39 <summary>
40 <span>Expand source code</span>
41 </summary>
42 <pre><code class="python">&#34;&#34;&#34;Descriptive models of a natural language (in this case, English).
43
44 The functions `Pwords`, `Pletters`, `Pbigrams`, and `Ptrigrams` return the
45 log probability of a section of text.
46
47 If you want to use a different language, replace the data files in
48 [`szyfrow/language_model_files`](../language_model_files/index.html).
49
50 * `count_1l.txt`: counts of single letters
51 * `count_2l.txt`: counts of pairs of letters, bigrams
52 * `count_3l.txt`: counts of triples of letters, trigrams
53 * `words.txt`: a dictionary of words, used for keyword-based cipher breaking.
54 These words should only contain characters contained in
55 `string.ascii_letters`.
56
57 &#34;&#34;&#34;
58
59 import string
60 import random
61 import collections
62 import itertools
63 from math import log10
64 import os
65 import importlib.resources as pkg_resources
66
67 import szyfrow.support.norms
68 from szyfrow.support.utilities import sanitise, deduplicate
69 from szyfrow import language_model_files
70
71
72 def datafile(name, sep=&#39;\t&#39;):
73 &#34;&#34;&#34;Read key,value pairs from file.
74 &#34;&#34;&#34;
75 with pkg_resources.open_text(language_model_files, name) as f:
76 # with open(p name), &#39;r&#39;) as f:
77 for line in f:
78 splits = line.split(sep)
79 yield [splits[0], int(splits[1])]
80
81 english_counts = collections.Counter(dict(datafile(&#39;count_1l.txt&#39;)))
82 &#34;&#34;&#34;Counts of single letters in English.&#34;&#34;&#34;
83 normalised_english_counts = szyfrow.support.norms.normalise(english_counts)
84 &#34;&#34;&#34;Normalised counts of single letters in English (the sum of all counts
85 adds to 1).&#34;&#34;&#34;
86
87 english_bigram_counts = collections.Counter(dict(datafile(&#39;count_2l.txt&#39;)))
88 &#34;&#34;&#34;Counts of letter bigrams in English.&#34;&#34;&#34;
89 normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)
90 &#34;&#34;&#34;Normalised counts of letter bigrams in English (the sum of all counts
91 adds to 1).&#34;&#34;&#34;
92
93 english_trigram_counts = collections.Counter(dict(datafile(&#39;count_3l.txt&#39;)))
94 &#34;&#34;&#34;Counts of letter trigrams in English.&#34;&#34;&#34;
95 normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)
96 &#34;&#34;&#34;Normalised counts of letter trigrams in English (the sum of all counts
97 adds to 1).&#34;&#34;&#34;
98
99 keywords = []
100 &#34;&#34;&#34;A sample list of keywords, to act as a dictionary for
101 dictionary-based cipher breaking attempts.&#34;&#34;&#34;
102 with pkg_resources.open_text(language_model_files, &#39;words.txt&#39;) as f:
103 keywords = [line.rstrip() for line in f]
104
105
106 def transpositions_of(keyword):
107 &#34;&#34;&#34;Finds the transpositions given by a keyword. For instance, the keyword
108 &#39;clever&#39; rearranges to &#39;celrv&#39;, so the first column (0) stays first, the
109 second column (1) moves to third, the third column (2) moves to second,
110 and so on.
111
112 If passed a tuple, assume it&#39;s already a transposition and just return it.
113
114 &gt;&gt;&gt; transpositions_of(&#39;clever&#39;)
115 (0, 2, 1, 4, 3)
116 &gt;&gt;&gt; transpositions_of(&#39;fred&#39;)
117 (3, 2, 0, 1)
118 &gt;&gt;&gt; transpositions_of((3, 2, 0, 1))
119 (3, 2, 0, 1)
120 &#34;&#34;&#34;
121 if isinstance(keyword, tuple):
122 return keyword
123 else:
124 key = deduplicate(keyword)
125 transpositions = tuple(key.index(l) for l in sorted(key))
126 return transpositions
127
128 transpositions = collections.defaultdict(list)
129 &#34;&#34;&#34;A sample dict of transpositions, to act as a dictionary for
130 dictionary-based cipher breaking attempts. Each key is a transposition,
131 each value is a list of words that give that transposition.&#34;&#34;&#34;
132 for word in keywords:
133 transpositions[transpositions_of(word)] += [word]
134
135
136 def weighted_choice(d):
137 &#34;&#34;&#34;Generate random item from a dictionary of item counts
138 &#34;&#34;&#34;
139 delems, dweights = list(zip(*d.items()))
140 return random.choices(delems, dweights)[0]
141 # target = random.uniform(0, sum(d.values()))
142 # cuml = 0.0
143 # for (l, p) in d.items():
144 # cuml += p
145 # if cuml &gt; target:
146 # return l
147 # return None
148
149 def random_english_letter():
150 &#34;&#34;&#34;Generate a random letter based on English letter counts
151 &#34;&#34;&#34;
152 return weighted_choice(normalised_english_counts)
153
154
155 def ngrams(text, n):
156 &#34;&#34;&#34;Returns all n-grams of a text
157
158 &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 2) # doctest: +NORMALIZE_WHITESPACE
159 [&#39;th&#39;, &#39;he&#39;, &#39;eq&#39;, &#39;qu&#39;, &#39;ui&#39;, &#39;ic&#39;, &#39;ck&#39;, &#39;kb&#39;, &#39;br&#39;, &#39;ro&#39;, &#39;ow&#39;, &#39;wn&#39;,
160 &#39;nf&#39;, &#39;fo&#39;, &#39;ox&#39;]
161 &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 4) # doctest: +NORMALIZE_WHITESPACE
162 [&#39;theq&#39;, &#39;hequ&#39;, &#39;equi&#39;, &#39;quic&#39;, &#39;uick&#39;, &#39;ickb&#39;, &#39;ckbr&#39;, &#39;kbro&#39;, &#39;brow&#39;,
163 &#39;rown&#39;, &#39;ownf&#39;, &#39;wnfo&#39;, &#39;nfox&#39;]
164 &#34;&#34;&#34;
165 return [text[i:i+n] for i in range(len(text)-n+1)]
166
167
168 class Pdist(dict):
169 &#34;&#34;&#34;A probability distribution estimated from counts in datafile.
170 Values are stored and returned as log probabilities.
171 &#34;&#34;&#34;
172 def __init__(self, data=[], estimate_of_missing=None):
173 data1, data2 = itertools.tee(data)
174 self.total = sum([d[1] for d in data1])
175 for key, count in data2:
176 self[key] = log10(count / self.total)
177 self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
178 def __missing__(self, key):
179 return self.estimate_of_missing(key, self.total)
180
181 def log_probability_of_unknown_word(key, N):
182 &#34;&#34;&#34;Estimate the probability of an unknown word.
183 &#34;&#34;&#34;
184 return -log10(N * 10**((len(key) - 2) * 1.4))
185
186 Pw = Pdist(datafile(&#39;count_1w.txt&#39;), log_probability_of_unknown_word)
187 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
188 of words. Unknown words have their probability estimated by
189 [log_probability_of_unknown_word](#szyfrow.support.language_models.log_probability_of_unknown_word)&#34;&#34;&#34;
190 Pl = Pdist(datafile(&#39;count_1l.txt&#39;), lambda _k, _N: 0)
191 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
192 of single letters. Unknown letters are given an estimate of zero.&#34;&#34;&#34;
193 P2l = Pdist(datafile(&#39;count_2l.txt&#39;), lambda _k, _N: 0)
194 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
195 of letter bigrams. Unknown bigrams are given an estimate of zero.&#34;&#34;&#34;
196 P3l = Pdist(datafile(&#39;count_3l.txt&#39;), lambda _k, _N: 0)
197 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
198 of letter trigrams. Unknown trigrams are given an estimate of zero.&#34;&#34;&#34;
199
200 def Pwords(words):
201 &#34;&#34;&#34;The Naive Bayes log probability of a sequence of words.
202 &#34;&#34;&#34;
203 return sum(Pw[w.lower()] for w in words)
204
205 def Pletters(letters):
206 &#34;&#34;&#34;The Naive Bayes log probability of a sequence of letters.
207 &#34;&#34;&#34;
208 return sum(Pl[l.lower()] for l in letters)
209
210 def Pbigrams(letters):
211 &#34;&#34;&#34;The Naive Bayes log probability of the bigrams formed from a sequence
212 of letters.
213 &#34;&#34;&#34;
214 return sum(P2l[p] for p in ngrams(letters, 2))
215
216 def Ptrigrams(letters):
217 &#34;&#34;&#34;The Naive Bayes log probability of the trigrams formed from a sequence
218 of letters.
219 &#34;&#34;&#34;
220 return sum(P3l[p] for p in ngrams(letters, 3))
221
222
223 def cosine_distance_score(text):
224 &#34;&#34;&#34;Finds the dissimilarity of a text to English, using the cosine distance
225 of the frequency distribution.
226
227 &gt;&gt;&gt; cosine_distance_score(&#39;abcabc&#39;) # doctest: +ELLIPSIS
228 0.73771...
229 &#34;&#34;&#34;
230 # return szyfrow.support.norms.cosine_distance(english_counts,
231 # collections.Counter(sanitise(text)))
232 return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
233 collections.Counter(sanitise(text)))
234
235
236 if __name__ == &#34;__main__&#34;:
237 import doctest
238 doctest.testmod()</code></pre>
239 </details>
240 </section>
241 <section>
242 </section>
243 <section>
244 <h2 class="section-title" id="header-variables">Global variables</h2>
245 <dl>
246 <dt id="szyfrow.support.language_models.P2l"><code class="name">var <span class="ident">P2l</span></code></dt>
247 <dd>
248 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
249 of letter bigrams. Unknown bigrams are given an estimate of zero.</p></div>
250 </dd>
251 <dt id="szyfrow.support.language_models.P3l"><code class="name">var <span class="ident">P3l</span></code></dt>
252 <dd>
253 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
254 of letter trigrams. Unknown trigrams are given an estimate of zero.</p></div>
255 </dd>
256 <dt id="szyfrow.support.language_models.Pl"><code class="name">var <span class="ident">Pl</span></code></dt>
257 <dd>
258 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
259 of single letters. Unknown letters are given an estimate of zero.</p></div>
260 </dd>
261 <dt id="szyfrow.support.language_models.Pw"><code class="name">var <span class="ident">Pw</span></code></dt>
262 <dd>
263 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
264 of words. Unknown words have their probability estimated by
265 <a href="#szyfrow.support.language_models.log_probability_of_unknown_word">log_probability_of_unknown_word</a>.</p></div>
266 </dd>
267 <dt id="szyfrow.support.language_models.english_bigram_counts"><code class="name">var <span class="ident">english_bigram_counts</span></code></dt>
268 <dd>
269 <div class="desc"><p>Counts of letter bigrams in English.</p></div>
270 </dd>
271 <dt id="szyfrow.support.language_models.english_counts"><code class="name">var <span class="ident">english_counts</span></code></dt>
272 <dd>
273 <div class="desc"><p>Counts of single letters in English.</p></div>
274 </dd>
275 <dt id="szyfrow.support.language_models.english_trigram_counts"><code class="name">var <span class="ident">english_trigram_counts</span></code></dt>
276 <dd>
277 <div class="desc"><p>Counts of letter trigrams in English.</p></div>
278 </dd>
279 <dt id="szyfrow.support.language_models.keywords"><code class="name">var <span class="ident">keywords</span></code></dt>
280 <dd>
281 <div class="desc"><p>A sample list of keywords, to act as a dictionary for
282 dictionary-based cipher breaking attempts.</p></div>
283 </dd>
284 <dt id="szyfrow.support.language_models.normalised_english_bigram_counts"><code class="name">var <span class="ident">normalised_english_bigram_counts</span></code></dt>
285 <dd>
286 <div class="desc"><p>Normalised counts of letter bigrams in English (the sum of all counts
287 adds to 1).</p></div>
288 </dd>
289 <dt id="szyfrow.support.language_models.normalised_english_counts"><code class="name">var <span class="ident">normalised_english_counts</span></code></dt>
290 <dd>
291 <div class="desc"><p>Normalised counts of single letters in English (the sum of all counts
292 adds to 1).</p></div>
293 </dd>
294 <dt id="szyfrow.support.language_models.normalised_english_trigram_counts"><code class="name">var <span class="ident">normalised_english_trigram_counts</span></code></dt>
295 <dd>
296 <div class="desc"><p>Normalised counts of letter trigrams in English (the sum of all counts
297 adds to 1).</p></div>
298 </dd>
299 <dt id="szyfrow.support.language_models.transpositions"><code class="name">var <span class="ident">transpositions</span></code></dt>
300 <dd>
301 <div class="desc"><p>A sample dict of transpositions, to act as a dictionary for
302 dictionary-based cipher breaking attempts. Each key is a transposition,
303 each value is a list of words that give that transposition.</p></div>
304 </dd>
305 </dl>
306 </section>
307 <section>
308 <h2 class="section-title" id="header-functions">Functions</h2>
309 <dl>
310 <dt id="szyfrow.support.language_models.Pbigrams"><code class="name flex">
311 <span>def <span class="ident">Pbigrams</span></span>(<span>letters)</span>
312 </code></dt>
313 <dd>
314 <div class="desc"><p>The Naive Bayes log probability of the bigrams formed from a sequence
315 of letters.</p></div>
316 <details class="source">
317 <summary>
318 <span>Expand source code</span>
319 </summary>
320 <pre><code class="python">def Pbigrams(letters):
321 &#34;&#34;&#34;The Naive Bayes log probability of the bigrams formed from a sequence
322 of letters.
323 &#34;&#34;&#34;
324 return sum(P2l[p] for p in ngrams(letters, 2))</code></pre>
325 </details>
326 </dd>
327 <dt id="szyfrow.support.language_models.Pletters"><code class="name flex">
328 <span>def <span class="ident">Pletters</span></span>(<span>letters)</span>
329 </code></dt>
330 <dd>
331 <div class="desc"><p>The Naive Bayes log probability of a sequence of letters.</p></div>
332 <details class="source">
333 <summary>
334 <span>Expand source code</span>
335 </summary>
336 <pre><code class="python">def Pletters(letters):
337 &#34;&#34;&#34;The Naive Bayes log probability of a sequence of letters.
338 &#34;&#34;&#34;
339 return sum(Pl[l.lower()] for l in letters)</code></pre>
340 </details>
341 </dd>
342 <dt id="szyfrow.support.language_models.Ptrigrams"><code class="name flex">
343 <span>def <span class="ident">Ptrigrams</span></span>(<span>letters)</span>
344 </code></dt>
345 <dd>
346 <div class="desc"><p>The Naive Bayes log probability of the trigrams formed from a sequence
347 of letters.</p></div>
348 <details class="source">
349 <summary>
350 <span>Expand source code</span>
351 </summary>
352 <pre><code class="python">def Ptrigrams(letters):
353 &#34;&#34;&#34;The Naive Bayes log probability of the trigrams formed from a sequence
354 of letters.
355 &#34;&#34;&#34;
356 return sum(P3l[p] for p in ngrams(letters, 3))</code></pre>
357 </details>
358 </dd>
359 <dt id="szyfrow.support.language_models.Pwords"><code class="name flex">
360 <span>def <span class="ident">Pwords</span></span>(<span>words)</span>
361 </code></dt>
362 <dd>
363 <div class="desc"><p>The Naive Bayes log probability of a sequence of words.</p></div>
364 <details class="source">
365 <summary>
366 <span>Expand source code</span>
367 </summary>
368 <pre><code class="python">def Pwords(words):
369 &#34;&#34;&#34;The Naive Bayes log probability of a sequence of words.
370 &#34;&#34;&#34;
371 return sum(Pw[w.lower()] for w in words)</code></pre>
372 </details>
373 </dd>
374 <dt id="szyfrow.support.language_models.cosine_distance_score"><code class="name flex">
375 <span>def <span class="ident">cosine_distance_score</span></span>(<span>text)</span>
376 </code></dt>
377 <dd>
378 <div class="desc"><p>Finds the dissimilarity of a text to English, using the cosine distance
379 of the frequency distribution.</p>
380 <pre><code class="language-python-repl">&gt;&gt;&gt; cosine_distance_score('abcabc') # doctest: +ELLIPSIS
381 0.73771...
382 </code></pre></div>
383 <details class="source">
384 <summary>
385 <span>Expand source code</span>
386 </summary>
387 <pre><code class="python">def cosine_distance_score(text):
388 &#34;&#34;&#34;Finds the dissimilarity of a text to English, using the cosine distance
389 of the frequency distribution.
390
391 &gt;&gt;&gt; cosine_distance_score(&#39;abcabc&#39;) # doctest: +ELLIPSIS
392 0.73771...
393 &#34;&#34;&#34;
394 # return szyfrow.support.norms.cosine_distance(english_counts,
395 # collections.Counter(sanitise(text)))
396 return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
397 collections.Counter(sanitise(text)))</code></pre>
398 </details>
399 </dd>
400 <dt id="szyfrow.support.language_models.datafile"><code class="name flex">
401 <span>def <span class="ident">datafile</span></span>(<span>name, sep='\t')</span>
402 </code></dt>
403 <dd>
404 <div class="desc"><p>Read key,value pairs from file.</p></div>
405 <details class="source">
406 <summary>
407 <span>Expand source code</span>
408 </summary>
409 <pre><code class="python">def datafile(name, sep=&#39;\t&#39;):
410 &#34;&#34;&#34;Read key,value pairs from file.
411 &#34;&#34;&#34;
412 with pkg_resources.open_text(language_model_files, name) as f:
413 # with open(p name), &#39;r&#39;) as f:
414 for line in f:
415 splits = line.split(sep)
416 yield [splits[0], int(splits[1])]</code></pre>
417 </details>
418 </dd>
419 <dt id="szyfrow.support.language_models.log_probability_of_unknown_word"><code class="name flex">
420 <span>def <span class="ident">log_probability_of_unknown_word</span></span>(<span>key, N)</span>
421 </code></dt>
422 <dd>
423 <div class="desc"><p>Estimate the probability of an unknown word.</p></div>
424 <details class="source">
425 <summary>
426 <span>Expand source code</span>
427 </summary>
428 <pre><code class="python">def log_probability_of_unknown_word(key, N):
429 &#34;&#34;&#34;Estimate the probability of an unknown word.
430 &#34;&#34;&#34;
431 return -log10(N * 10**((len(key) - 2) * 1.4))</code></pre>
432 </details>
433 </dd>
434 <dt id="szyfrow.support.language_models.ngrams"><code class="name flex">
435 <span>def <span class="ident">ngrams</span></span>(<span>text, n)</span>
436 </code></dt>
437 <dd>
438 <div class="desc"><p>Returns all n-grams of a text</p>
439 <pre><code class="language-python-repl">&gt;&gt;&gt; ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
440 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
441 'nf', 'fo', 'ox']
442 &gt;&gt;&gt; ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
443 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
444 'rown', 'ownf', 'wnfo', 'nfox']
445 </code></pre></div>
446 <details class="source">
447 <summary>
448 <span>Expand source code</span>
449 </summary>
450 <pre><code class="python">def ngrams(text, n):
451 &#34;&#34;&#34;Returns all n-grams of a text
452
453 &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 2) # doctest: +NORMALIZE_WHITESPACE
454 [&#39;th&#39;, &#39;he&#39;, &#39;eq&#39;, &#39;qu&#39;, &#39;ui&#39;, &#39;ic&#39;, &#39;ck&#39;, &#39;kb&#39;, &#39;br&#39;, &#39;ro&#39;, &#39;ow&#39;, &#39;wn&#39;,
455 &#39;nf&#39;, &#39;fo&#39;, &#39;ox&#39;]
456 &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 4) # doctest: +NORMALIZE_WHITESPACE
457 [&#39;theq&#39;, &#39;hequ&#39;, &#39;equi&#39;, &#39;quic&#39;, &#39;uick&#39;, &#39;ickb&#39;, &#39;ckbr&#39;, &#39;kbro&#39;, &#39;brow&#39;,
458 &#39;rown&#39;, &#39;ownf&#39;, &#39;wnfo&#39;, &#39;nfox&#39;]
459 &#34;&#34;&#34;
460 return [text[i:i+n] for i in range(len(text)-n+1)]</code></pre>
461 </details>
462 </dd>
463 <dt id="szyfrow.support.language_models.random_english_letter"><code class="name flex">
464 <span>def <span class="ident">random_english_letter</span></span>(<span>)</span>
465 </code></dt>
466 <dd>
467 <div class="desc"><p>Generate a random letter based on English letter counts</p></div>
468 <details class="source">
469 <summary>
470 <span>Expand source code</span>
471 </summary>
472 <pre><code class="python">def random_english_letter():
473 &#34;&#34;&#34;Generate a random letter based on English letter counts
474 &#34;&#34;&#34;
475 return weighted_choice(normalised_english_counts)</code></pre>
476 </details>
477 </dd>
478 <dt id="szyfrow.support.language_models.transpositions_of"><code class="name flex">
479 <span>def <span class="ident">transpositions_of</span></span>(<span>keyword)</span>
480 </code></dt>
481 <dd>
482 <div class="desc"><p>Finds the transpositions given by a keyword. For instance, the keyword
483 'clever' rearranges to 'celrv', so the first column (0) stays first, the
484 second column (1) moves to third, the third column (2) moves to second,
485 and so on.</p>
486 <p>If passed a tuple, assume it's already a transposition and just return it.</p>
487 <pre><code class="language-python-repl">&gt;&gt;&gt; transpositions_of('clever')
488 (0, 2, 1, 4, 3)
489 &gt;&gt;&gt; transpositions_of('fred')
490 (3, 2, 0, 1)
491 &gt;&gt;&gt; transpositions_of((3, 2, 0, 1))
492 (3, 2, 0, 1)
493 </code></pre></div>
494 <details class="source">
495 <summary>
496 <span>Expand source code</span>
497 </summary>
498 <pre><code class="python">def transpositions_of(keyword):
499 &#34;&#34;&#34;Finds the transpositions given by a keyword. For instance, the keyword
500 &#39;clever&#39; rearranges to &#39;celrv&#39;, so the first column (0) stays first, the
501 second column (1) moves to third, the third column (2) moves to second,
502 and so on.
503
504 If passed a tuple, assume it&#39;s already a transposition and just return it.
505
506 &gt;&gt;&gt; transpositions_of(&#39;clever&#39;)
507 (0, 2, 1, 4, 3)
508 &gt;&gt;&gt; transpositions_of(&#39;fred&#39;)
509 (3, 2, 0, 1)
510 &gt;&gt;&gt; transpositions_of((3, 2, 0, 1))
511 (3, 2, 0, 1)
512 &#34;&#34;&#34;
513 if isinstance(keyword, tuple):
514 return keyword
515 else:
516 key = deduplicate(keyword)
517 transpositions = tuple(key.index(l) for l in sorted(key))
518 return transpositions</code></pre>
519 </details>
520 </dd>
521 <dt id="szyfrow.support.language_models.weighted_choice"><code class="name flex">
522 <span>def <span class="ident">weighted_choice</span></span>(<span>d)</span>
523 </code></dt>
524 <dd>
525 <div class="desc"><p>Generate random item from a dictionary of item counts</p></div>
526 <details class="source">
527 <summary>
528 <span>Expand source code</span>
529 </summary>
530 <pre><code class="python">def weighted_choice(d):
531 &#34;&#34;&#34;Generate random item from a dictionary of item counts
532 &#34;&#34;&#34;
533 delems, dweights = list(zip(*d.items()))
534 return random.choices(delems, dweights)[0]
535 # target = random.uniform(0, sum(d.values()))
536 # cuml = 0.0
537 # for (l, p) in d.items():
538 # cuml += p
539 # if cuml &gt; target:
540 # return l
541 # return None</code></pre>
542 </details>
543 </dd>
544 </dl>
545 </section>
546 <section>
547 <h2 class="section-title" id="header-classes">Classes</h2>
548 <dl>
549 <dt id="szyfrow.support.language_models.Pdist"><code class="flex name class">
550 <span>class <span class="ident">Pdist</span></span>
551 <span>(</span><span>data=[], estimate_of_missing=None)</span>
552 </code></dt>
553 <dd>
554 <div class="desc"><p>A probability distribution estimated from counts in datafile.
555 Values are stored and returned as log probabilities.</p></div>
556 <details class="source">
557 <summary>
558 <span>Expand source code</span>
559 </summary>
560 <pre><code class="python">class Pdist(dict):
561 &#34;&#34;&#34;A probability distribution estimated from counts in datafile.
562 Values are stored and returned as log probabilities.
563 &#34;&#34;&#34;
564 def __init__(self, data=[], estimate_of_missing=None):
565 data1, data2 = itertools.tee(data)
566 self.total = sum([d[1] for d in data1])
567 for key, count in data2:
568 self[key] = log10(count / self.total)
569 self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
570 def __missing__(self, key):
571 return self.estimate_of_missing(key, self.total)</code></pre>
572 </details>
573 <h3>Ancestors</h3>
574 <ul class="hlist">
575 <li>builtins.dict</li>
576 </ul>
577 </dd>
578 </dl>
579 </section>
580 </article>
581 <nav id="sidebar">
582 <h1>Index</h1>
583 <div class="toc">
584 <ul></ul>
585 </div>
586 <ul id="index">
587 <li><h3>Super-module</h3>
588 <ul>
589 <li><code><a title="szyfrow.support" href="index.html">szyfrow.support</a></code></li>
590 </ul>
591 </li>
592 <li><h3><a href="#header-variables">Global variables</a></h3>
593 <ul class="">
594 <li><code><a title="szyfrow.support.language_models.P2l" href="#szyfrow.support.language_models.P2l">P2l</a></code></li>
595 <li><code><a title="szyfrow.support.language_models.P3l" href="#szyfrow.support.language_models.P3l">P3l</a></code></li>
596 <li><code><a title="szyfrow.support.language_models.Pl" href="#szyfrow.support.language_models.Pl">Pl</a></code></li>
597 <li><code><a title="szyfrow.support.language_models.Pw" href="#szyfrow.support.language_models.Pw">Pw</a></code></li>
598 <li><code><a title="szyfrow.support.language_models.english_bigram_counts" href="#szyfrow.support.language_models.english_bigram_counts">english_bigram_counts</a></code></li>
599 <li><code><a title="szyfrow.support.language_models.english_counts" href="#szyfrow.support.language_models.english_counts">english_counts</a></code></li>
600 <li><code><a title="szyfrow.support.language_models.english_trigram_counts" href="#szyfrow.support.language_models.english_trigram_counts">english_trigram_counts</a></code></li>
601 <li><code><a title="szyfrow.support.language_models.keywords" href="#szyfrow.support.language_models.keywords">keywords</a></code></li>
602 <li><code><a title="szyfrow.support.language_models.normalised_english_bigram_counts" href="#szyfrow.support.language_models.normalised_english_bigram_counts">normalised_english_bigram_counts</a></code></li>
603 <li><code><a title="szyfrow.support.language_models.normalised_english_counts" href="#szyfrow.support.language_models.normalised_english_counts">normalised_english_counts</a></code></li>
604 <li><code><a title="szyfrow.support.language_models.normalised_english_trigram_counts" href="#szyfrow.support.language_models.normalised_english_trigram_counts">normalised_english_trigram_counts</a></code></li>
605 <li><code><a title="szyfrow.support.language_models.transpositions" href="#szyfrow.support.language_models.transpositions">transpositions</a></code></li>
606 </ul>
607 </li>
608 <li><h3><a href="#header-functions">Functions</a></h3>
609 <ul class="">
610 <li><code><a title="szyfrow.support.language_models.Pbigrams" href="#szyfrow.support.language_models.Pbigrams">Pbigrams</a></code></li>
611 <li><code><a title="szyfrow.support.language_models.Pletters" href="#szyfrow.support.language_models.Pletters">Pletters</a></code></li>
612 <li><code><a title="szyfrow.support.language_models.Ptrigrams" href="#szyfrow.support.language_models.Ptrigrams">Ptrigrams</a></code></li>
613 <li><code><a title="szyfrow.support.language_models.Pwords" href="#szyfrow.support.language_models.Pwords">Pwords</a></code></li>
614 <li><code><a title="szyfrow.support.language_models.cosine_distance_score" href="#szyfrow.support.language_models.cosine_distance_score">cosine_distance_score</a></code></li>
615 <li><code><a title="szyfrow.support.language_models.datafile" href="#szyfrow.support.language_models.datafile">datafile</a></code></li>
616 <li><code><a title="szyfrow.support.language_models.log_probability_of_unknown_word" href="#szyfrow.support.language_models.log_probability_of_unknown_word">log_probability_of_unknown_word</a></code></li>
617 <li><code><a title="szyfrow.support.language_models.ngrams" href="#szyfrow.support.language_models.ngrams">ngrams</a></code></li>
618 <li><code><a title="szyfrow.support.language_models.random_english_letter" href="#szyfrow.support.language_models.random_english_letter">random_english_letter</a></code></li>
619 <li><code><a title="szyfrow.support.language_models.transpositions_of" href="#szyfrow.support.language_models.transpositions_of">transpositions_of</a></code></li>
620 <li><code><a title="szyfrow.support.language_models.weighted_choice" href="#szyfrow.support.language_models.weighted_choice">weighted_choice</a></code></li>
621 </ul>
622 </li>
623 <li><h3><a href="#header-classes">Classes</a></h3>
624 <ul>
625 <li>
626 <h4><code><a title="szyfrow.support.language_models.Pdist" href="#szyfrow.support.language_models.Pdist">Pdist</a></code></h4>
627 </li>
628 </ul>
629 </li>
630 </ul>
631 </nav>
632 </main>
633 <footer id="footer">
634 <p>Generated by <a href="https://pdoc3.github.io/pdoc"><cite>pdoc</cite> 0.9.2</a>.</p>
635 </footer>
636 </body>
637 </html>