szyfrow/support/language_models.html

   1 <!doctype html>
   2 <html lang="en">
   3 <head>
   4 <meta charset="utf-8">
   5 <meta name="viewport" content="width=device-width, initial-scale=1, minimum-scale=1" />
   6 <meta name="generator" content="pdoc 0.9.2" />
   7 <title>szyfrow.support.language_models API documentation</title>
   8 <meta name="description" content="Descriptive models of a natural language (in this case, English) …" />
   9 <link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity="sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin>
  10 <link rel="preload stylesheet" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity="sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin>
  11 <link rel="stylesheet preload" as="style" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin>
  12 <style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:1.5em}#content{padding:20px}#sidebar{padding:30px;overflow:hidden}#sidebar > *:last-child{margin-bottom:2cm}.http-server-breadcrumbs{font-size:130%;margin:0 0 15px 0}#footer{font-size:.75em;padding:5px 30px;border-top:1px solid #ddd;text-align:right}#footer p{margin:0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:30px}h1,h2,h3,h4,h5{font-weight:300}h1{font-size:2.5em;line-height:1.1em}h2{font-size:1.75em;margin:1em 0 .50em 0}h3{font-size:1.4em;margin:25px 0 10px 0}h4{margin:0;font-size:105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:.2em 0}a{color:#058;text-decoration:none;transition:color .3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^="header-"]{margin-top:2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:.8em;line-height:1.4em}code{background:#f2f2f1;padding:1px 4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:0;border-top:1px solid #ccc;border-bottom:1px solid #ccc;margin:1em 0;padding:1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:10%}#http-server-module-list p{margin-top:0}.toc ul,#index{list-style-type:none;margin:0;padding:0}#index code{background:transparent}#index h3{border-bottom:1px solid #ddd}#index ul{padding:0}#index h4{margin-top:.6em;font-weight:bold}@media (min-width:200ex){#index .two-column{column-count:2}}@media (min-width:300ex){#index .two-column{column-count:3}}dl{margin-bottom:2em}dl dl:last-child{margin-bottom:4em}dd{margin:0 0 1em 3em}#header-classes + dl > dd{margin-bottom:3em}dd dd{margin-left:2em}dd p{margin:10px 0}.name{background:#eee;font-weight:bold;font-size:.85em;padding:5px 10px;display:inline-block;min-width:40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name > 
span:first-child{white-space:nowrap}.name.class > span:nth-child(2){margin-left:.4em}.inherited{color:#999;border-left:5px solid #eee;padding-left:1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:400;font-size:1.25em}.desc h3{font-size:1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:400;font-size:.8em;text-transform:uppercase}.source summary > *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:1em}.source pre{max-height:500px;overflow:auto;margin:0}.source pre code{font-size:12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:1em}img{max-width:100%}td{padding:0 .5em}.admonition{padding:.1em .5em;margin-bottom:1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}</style>
  13 <style media="screen and (min-width: 700px)">@media screen and (min-width:700px){#sidebar{width:30%;height:100vh;overflow:auto;position:sticky;top:0}#content{width:70%;max-width:100ch;padding:3em 4em;border-left:1px solid #ddd}pre code{font-size:1em}.item .name{font-size:1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:1.5em}.toc > ul > li{margin-top:.5em}}</style>
  14 <style media="print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:" (" attr(href) ")";font-size:90%}a[href][title]:after{content:none}abbr[title]:after{content:" (" attr(title) ")"}.ir a:after,a[href^="javascript:"]:after,a[href^="#"]:after{content:""}pre,blockquote{border:1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:100% !important}@page{margin:0.5cm}p,h2,h3{orphans:3;widows:3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}</style>
  15 <script defer src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity="sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin></script>
  16 <script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())</script>
  17 </head>
  18 <body>
  19 <main>
  20 <article id="content">
  21 <header>
  22 <h1 class="title">Module <code>szyfrow.support.language_models</code></h1>
  23 </header>
  24 <section id="section-intro">
  25 <p>Descriptive models of a natural language (in this case, English).</p>
  26 <p>The functions <code><a title="szyfrow.support.language_models.Pwords" href="#szyfrow.support.language_models.Pwords">Pwords()</a></code>, <code><a title="szyfrow.support.language_models.Pletters" href="#szyfrow.support.language_models.Pletters">Pletters()</a></code>, <code><a title="szyfrow.support.language_models.Pbigrams" href="#szyfrow.support.language_models.Pbigrams">Pbigrams()</a></code>, and <code><a title="szyfrow.support.language_models.Ptrigrams" href="#szyfrow.support.language_models.Ptrigrams">Ptrigrams()</a></code> return the
  27 log probability of a section of text.</p>
  28 <p>If you want to use a different language, replace the data files in
  29 <a href="../language_model_files/index.html"><code>szyfrow/language_model_files</code></a>.</p>
  30 <ul>
  31 <li><code>count_1l.txt</code>: counts of single letters</li>
  32 <li><code>count_2l.txt</code>: counts of pairs of letters, bigrams</li>
  33 <li><code>count_3l.txt</code>: counts of triples of letters, trigrams</li>
  34 <li><code>words.txt</code>: a dictionary of words, used for keyword-based cipher breaking.
  35 These words should only contain characters contained in
  36 <code>string.ascii_letters</code>.</li>
  37 </ul>
  38 <details class="source">
  39 <summary>
  40 <span>Expand source code</span>
  41 </summary>
  42 <pre><code class="python">&#34;&#34;&#34;Descriptive models of a natural language (in this case, English).
  43
  44 The functions `Pwords`, `Pletters`, `Pbigrams`, and `Ptrigrams` return the
  45 log probability of a section of text.
  46
  47 If you want to use a different language, replace the data files in
  48 [`szyfrow/language_model_files`](../language_model_files/index.html).
  49
  50 * `count_1l.txt`: counts of single letters
  51 * `count_2l.txt`: counts of pairs letters, bigrams
  52 * `count_3l.txt`: counts of triples of letters, triagrams
  53 * `words.txt`: a dictionary of words, used for keyword-based cipher breaking.
  54   These words should only contain characters cointained in
  55   `string.ascii_letters`.
  56
  57 &#34;&#34;&#34;
  58
  59 import string
  60 import random
  61 import collections
  62 import itertools
  63 from math import log10
  64 import os
  65 import importlib.resources as pkg_resources
  66
  67 import szyfrow.support.norms
  68 from szyfrow.support.utilities import sanitise, deduplicate
  69 from szyfrow import language_model_files
  70
  71
  72 def datafile(name, sep=&#39;\t&#39;):
  73     &#34;&#34;&#34;Read key,value pairs from file.
  74     &#34;&#34;&#34;
  75     with pkg_resources.open_text(language_model_files, name) as f:
  76     # with open(p name), &#39;r&#39;) as f:
  77         for line in f:
  78             splits = line.split(sep)
  79             yield [splits[0], int(splits[1])]
  80
  81 english_counts = collections.Counter(dict(datafile(&#39;count_1l.txt&#39;)))
  82 &#34;&#34;&#34;Counts of single letters in English.&#34;&#34;&#34;
  83 normalised_english_counts = szyfrow.support.norms.normalise(english_counts)
  84 &#34;&#34;&#34;Normalised counts of single letters in English (the sum of all counts
  85 adds to 1).&#34;&#34;&#34;
  86
  87 english_bigram_counts = collections.Counter(dict(datafile(&#39;count_2l.txt&#39;)))
  88 &#34;&#34;&#34;Counts of letter bigrams in English.&#34;&#34;&#34;
  89 normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)
  90 &#34;&#34;&#34;Normalised counts of letter bigrams in English (the sum of all counts
  91 adds to 1).&#34;&#34;&#34;
  92
  93 english_trigram_counts = collections.Counter(dict(datafile(&#39;count_3l.txt&#39;)))
  94 &#34;&#34;&#34;Counts of letter trigrams in English.&#34;&#34;&#34;
  95 normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)
  96 &#34;&#34;&#34;Normalised counts of letter trigrams in English (the sum of all counts
  97 adds to 1).&#34;&#34;&#34;
  98
  99 keywords = []
 100 &#34;&#34;&#34;A sample list of keywords, to act as a dictionary for
 101 dictionary-based cipher breaking attempts.&#34;&#34;&#34;
 102 with pkg_resources.open_text(language_model_files, &#39;words.txt&#39;) as f:
 103     keywords = [line.rstrip() for line in f]
 104
 105
 106 def transpositions_of(keyword):
 107     &#34;&#34;&#34;Finds the transpostions given by a keyword. For instance, the keyword
 108     &#39;clever&#39; rearranges to &#39;celrv&#39;, so the first column (0) stays first, the
 109     second column (1) moves to third, the third column (2) moves to second,
 110     and so on.
 111
 112     If passed a tuple, assume it&#39;s already a transposition and just return it.
 113
 114     &gt;&gt;&gt; transpositions_of(&#39;clever&#39;)
 115     (0, 2, 1, 4, 3)
 116     &gt;&gt;&gt; transpositions_of(&#39;fred&#39;)
 117     (3, 2, 0, 1)
 118     &gt;&gt;&gt; transpositions_of((3, 2, 0, 1))
 119     (3, 2, 0, 1)
 120     &#34;&#34;&#34;
 121     if isinstance(keyword, tuple):
 122         return keyword
 123     else:
 124         key = deduplicate(keyword)
 125         transpositions = tuple(key.index(l) for l in sorted(key))
 126         return transpositions
 127
 128 transpositions = collections.defaultdict(list)
 129 &#34;&#34;&#34;A sample dict of transpositions, to act as a dictionary for
 130 dictionary-based cipher breaking attempts. Each key is a transposition,
 131 each value is a list of words that give that transposition.&#34;&#34;&#34;
 132 for word in keywords:
 133     transpositions[transpositions_of(word)] += [word]
 134
 135
 136 def weighted_choice(d):
 137     &#34;&#34;&#34;Generate random item from a dictionary of item counts
 138     &#34;&#34;&#34;
 139     delems, dweights = list(zip(*d.items()))
 140     return random.choices(delems, dweights)[0]
 141     # target = random.uniform(0, sum(d.values()))
 142     # cuml = 0.0
 143     # for (l, p) in d.items():
 144     #     cuml += p
 145     #     if cuml &gt; target:
 146     #         return l
 147     # return None
 148
 149 def random_english_letter():
 150     &#34;&#34;&#34;Generate a random letter based on English letter counts
 151     &#34;&#34;&#34;
 152     return weighted_choice(normalised_english_counts)
 153
 154
 155 def ngrams(text, n):
 156     &#34;&#34;&#34;Returns all n-grams of a text
 157
 158     &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 2) # doctest: +NORMALIZE_WHITESPACE
 159     [&#39;th&#39;, &#39;he&#39;, &#39;eq&#39;, &#39;qu&#39;, &#39;ui&#39;, &#39;ic&#39;, &#39;ck&#39;, &#39;kb&#39;, &#39;br&#39;, &#39;ro&#39;, &#39;ow&#39;, &#39;wn&#39;,
 160      &#39;nf&#39;, &#39;fo&#39;, &#39;ox&#39;]
 161     &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 4) # doctest: +NORMALIZE_WHITESPACE
 162     [&#39;theq&#39;, &#39;hequ&#39;, &#39;equi&#39;, &#39;quic&#39;, &#39;uick&#39;, &#39;ickb&#39;, &#39;ckbr&#39;, &#39;kbro&#39;, &#39;brow&#39;,
 163      &#39;rown&#39;, &#39;ownf&#39;, &#39;wnfo&#39;, &#39;nfox&#39;]
 164     &#34;&#34;&#34;
 165     return [text[i:i+n] for i in range(len(text)-n+1)]
 166
 167
 168 class Pdist(dict):
 169     &#34;&#34;&#34;A probability distribution estimated from counts in datafile.
 170     Values are stored and returned as log probabilities.
 171     &#34;&#34;&#34;
 172     def __init__(self, data=[], estimate_of_missing=None):
 173         data1, data2 = itertools.tee(data)
 174         self.total = sum([d[1] for d in data1])
 175         for key, count in data2:
 176             self[key] = log10(count / self.total)
 177         self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
 178     def __missing__(self, key):
 179         return self.estimate_of_missing(key, self.total)
 180
 181 def log_probability_of_unknown_word(key, N):
 182     &#34;&#34;&#34;Estimate the probability of an unknown word.
 183     &#34;&#34;&#34;
 184     return -log10(N * 10**((len(key) - 2) * 1.4))
 185
 186 Pw = Pdist(datafile(&#39;count_1w.txt&#39;), log_probability_of_unknown_word)
 187 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
 188 of words. Unknown words have their probability estimated by
 189 [log_probability_of_unknown_word](#szyfrow.support.language_models.log_probability_of_unknown_word)&#34;&#34;&#34;
 190 Pl = Pdist(datafile(&#39;count_1l.txt&#39;), lambda _k, _N: 0)
 191 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
 192 of single letters. Unknown words have their probability estimated as zero.&#34;&#34;&#34;
 193 P2l = Pdist(datafile(&#39;count_2l.txt&#39;), lambda _k, _N: 0)
 194 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
 195 of letter bigrams. Unknown words have their probability estimated as zero.&#34;&#34;&#34;
 196 P3l = Pdist(datafile(&#39;count_3l.txt&#39;), lambda _k, _N: 0)
 197 &#34;&#34;&#34;A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
 198 of letter trigrams. Unknown words have their probability estimated as zero.&#34;&#34;&#34;
 199
 200 def Pwords(words):
 201     &#34;&#34;&#34;The Naive Bayes log probability of a sequence of words.
 202     &#34;&#34;&#34;
 203     return sum(Pw[w.lower()] for w in words)
 204
 205 def Pletters(letters):
 206     &#34;&#34;&#34;The Naive Bayes log probability of a sequence of letters.
 207     &#34;&#34;&#34;
 208     return sum(Pl[l.lower()] for l in letters)
 209
 210 def Pbigrams(letters):
 211     &#34;&#34;&#34;The Naive Bayes log probability of the bigrams formed from a sequence
 212     of letters.
 213     &#34;&#34;&#34;
 214     return sum(P2l[p] for p in ngrams(letters, 2))
 215
 216 def Ptrigrams(letters):
 217     &#34;&#34;&#34;The Naive Bayes log probability of the trigrams formed from a sequence
 218     of letters.
 219     &#34;&#34;&#34;
 220     return sum(P3l[p] for p in ngrams(letters, 3))
 221
 222
 223 def cosine_distance_score(text):
 224     &#34;&#34;&#34;Finds the dissimilarity of a text to English, using the cosine distance
 225     of the frequency distribution.
 226
 227     &gt;&gt;&gt; cosine_distance_score(&#39;abcabc&#39;) # doctest: +ELLIPSIS
 228     0.73771...
 229     &#34;&#34;&#34;
 230     # return szyfrow.support.norms.cosine_distance(english_counts,
 231     #     collections.Counter(sanitise(text)))
 232     return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
 233         collections.Counter(sanitise(text)))
 234
 235
 236 if __name__ == &#34;__main__&#34;:
 237     import doctest
 238     doctest.testmod()</code></pre>
 239 </details>
 240 </section>
 241 <section>
 242 </section>
 243 <section>
 244 <h2 class="section-title" id="header-variables">Global variables</h2>
 245 <dl>
 246 <dt id="szyfrow.support.language_models.P2l"><code class="name">var <span class="ident">P2l</span></code></dt>
 247 <dd>
 248 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
 249 of letter bigrams. Unknown bigrams have their probability estimated as zero.</p></div>
 250 </dd>
 251 <dt id="szyfrow.support.language_models.P3l"><code class="name">var <span class="ident">P3l</span></code></dt>
 252 <dd>
 253 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
 254 of letter trigrams. Unknown trigrams have their probability estimated as zero.</p></div>
 255 </dd>
 256 <dt id="szyfrow.support.language_models.Pl"><code class="name">var <span class="ident">Pl</span></code></dt>
 257 <dd>
 258 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
 259 of single letters. Unknown letters have their probability estimated as zero.</p></div>
 260 </dd>
 261 <dt id="szyfrow.support.language_models.Pw"><code class="name">var <span class="ident">Pw</span></code></dt>
 262 <dd>
 263 <div class="desc"><p>A <a href="#szyfrow.support.language_models.Pdist">Pdist</a> holding log probabilities
 264 of words. Unknown words have their probability estimated by
 265 <a href="#szyfrow.support.language_models.log_probability_of_unknown_word">log_probability_of_unknown_word</a>.</p></div>
 266 </dd>
 267 <dt id="szyfrow.support.language_models.english_bigram_counts"><code class="name">var <span class="ident">english_bigram_counts</span></code></dt>
 268 <dd>
 269 <div class="desc"><p>Counts of letter bigrams in English.</p></div>
 270 </dd>
 271 <dt id="szyfrow.support.language_models.english_counts"><code class="name">var <span class="ident">english_counts</span></code></dt>
 272 <dd>
 273 <div class="desc"><p>Counts of single letters in English.</p></div>
 274 </dd>
 275 <dt id="szyfrow.support.language_models.english_trigram_counts"><code class="name">var <span class="ident">english_trigram_counts</span></code></dt>
 276 <dd>
 277 <div class="desc"><p>Counts of letter trigrams in English.</p></div>
 278 </dd>
 279 <dt id="szyfrow.support.language_models.keywords"><code class="name">var <span class="ident">keywords</span></code></dt>
 280 <dd>
 281 <div class="desc"><p>A sample list of keywords, to act as a dictionary for
 282 dictionary-based cipher breaking attempts.</p></div>
 283 </dd>
 284 <dt id="szyfrow.support.language_models.normalised_english_bigram_counts"><code class="name">var <span class="ident">normalised_english_bigram_counts</span></code></dt>
 285 <dd>
 286 <div class="desc"><p>Normalised counts of letter bigrams in English (the sum of all counts
 287 adds to 1).</p></div>
 288 </dd>
 289 <dt id="szyfrow.support.language_models.normalised_english_counts"><code class="name">var <span class="ident">normalised_english_counts</span></code></dt>
 290 <dd>
 291 <div class="desc"><p>Normalised counts of single letters in English (the sum of all counts
 292 adds to 1).</p></div>
 293 </dd>
 294 <dt id="szyfrow.support.language_models.normalised_english_trigram_counts"><code class="name">var <span class="ident">normalised_english_trigram_counts</span></code></dt>
 295 <dd>
 296 <div class="desc"><p>Normalised counts of letter trigrams in English (the sum of all counts
 297 adds to 1).</p></div>
 298 </dd>
 299 <dt id="szyfrow.support.language_models.transpositions"><code class="name">var <span class="ident">transpositions</span></code></dt>
 300 <dd>
 301 <div class="desc"><p>A sample dict of transpositions, to act as a dictionary for
 302 dictionary-based cipher breaking attempts. Each key is a transposition,
 303 each value is a list of words that give that transposition.</p></div>
 304 </dd>
 305 </dl>
 306 </section>
 307 <section>
 308 <h2 class="section-title" id="header-functions">Functions</h2>
 309 <dl>
 310 <dt id="szyfrow.support.language_models.Pbigrams"><code class="name flex">
 311 <span>def <span class="ident">Pbigrams</span></span>(<span>letters)</span>
 312 </code></dt>
 313 <dd>
 314 <div class="desc"><p>The Naive Bayes log probability of the bigrams formed from a sequence
 315 of letters.</p></div>
 316 <details class="source">
 317 <summary>
 318 <span>Expand source code</span>
 319 </summary>
 320 <pre><code class="python">def Pbigrams(letters):
 321     &#34;&#34;&#34;The Naive Bayes log probability of the bigrams formed from a sequence
 322     of letters.
 323     &#34;&#34;&#34;
 324     return sum(P2l[p] for p in ngrams(letters, 2))</code></pre>
 325 </details>
 326 </dd>
 327 <dt id="szyfrow.support.language_models.Pletters"><code class="name flex">
 328 <span>def <span class="ident">Pletters</span></span>(<span>letters)</span>
 329 </code></dt>
 330 <dd>
 331 <div class="desc"><p>The Naive Bayes log probability of a sequence of letters.</p></div>
 332 <details class="source">
 333 <summary>
 334 <span>Expand source code</span>
 335 </summary>
 336 <pre><code class="python">def Pletters(letters):
 337     &#34;&#34;&#34;The Naive Bayes log probability of a sequence of letters.
 338     &#34;&#34;&#34;
 339     return sum(Pl[l.lower()] for l in letters)</code></pre>
 340 </details>
 341 </dd>
 342 <dt id="szyfrow.support.language_models.Ptrigrams"><code class="name flex">
 343 <span>def <span class="ident">Ptrigrams</span></span>(<span>letters)</span>
 344 </code></dt>
 345 <dd>
 346 <div class="desc"><p>The Naive Bayes log probability of the trigrams formed from a sequence
 347 of letters.</p></div>
 348 <details class="source">
 349 <summary>
 350 <span>Expand source code</span>
 351 </summary>
 352 <pre><code class="python">def Ptrigrams(letters):
 353     &#34;&#34;&#34;The Naive Bayes log probability of the trigrams formed from a sequence
 354     of letters.
 355     &#34;&#34;&#34;
 356     return sum(P3l[p] for p in ngrams(letters, 3))</code></pre>
 357 </details>
 358 </dd>
 359 <dt id="szyfrow.support.language_models.Pwords"><code class="name flex">
 360 <span>def <span class="ident">Pwords</span></span>(<span>words)</span>
 361 </code></dt>
 362 <dd>
 363 <div class="desc"><p>The Naive Bayes log probability of a sequence of words.</p></div>
 364 <details class="source">
 365 <summary>
 366 <span>Expand source code</span>
 367 </summary>
 368 <pre><code class="python">def Pwords(words):
 369     &#34;&#34;&#34;The Naive Bayes log probability of a sequence of words.
 370     &#34;&#34;&#34;
 371     return sum(Pw[w.lower()] for w in words)</code></pre>
 372 </details>
 373 </dd>
 374 <dt id="szyfrow.support.language_models.cosine_distance_score"><code class="name flex">
 375 <span>def <span class="ident">cosine_distance_score</span></span>(<span>text)</span>
 376 </code></dt>
 377 <dd>
 378 <div class="desc"><p>Finds the dissimilarity of a text to English, using the cosine distance
 379 of the frequency distribution.</p>
 380 <pre><code class="language-python-repl">&gt;&gt;&gt; cosine_distance_score('abcabc') # doctest: +ELLIPSIS
 381 0.73771...
 382 </code></pre></div>
 383 <details class="source">
 384 <summary>
 385 <span>Expand source code</span>
 386 </summary>
 387 <pre><code class="python">def cosine_distance_score(text):
 388     &#34;&#34;&#34;Finds the dissimilarity of a text to English, using the cosine distance
 389     of the frequency distribution.
 390
 391     &gt;&gt;&gt; cosine_distance_score(&#39;abcabc&#39;) # doctest: +ELLIPSIS
 392     0.73771...
 393     &#34;&#34;&#34;
 394     # return szyfrow.support.norms.cosine_distance(english_counts,
 395     #     collections.Counter(sanitise(text)))
 396     return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
 397         collections.Counter(sanitise(text)))</code></pre>
 398 </details>
 399 </dd>
 400 <dt id="szyfrow.support.language_models.datafile"><code class="name flex">
 401 <span>def <span class="ident">datafile</span></span>(<span>name, sep='\t')</span>
 402 </code></dt>
 403 <dd>
 404 <div class="desc"><p>Read key,value pairs from file.</p></div>
 405 <details class="source">
 406 <summary>
 407 <span>Expand source code</span>
 408 </summary>
 409 <pre><code class="python">def datafile(name, sep=&#39;\t&#39;):
 410     &#34;&#34;&#34;Read key,value pairs from file.
 411     &#34;&#34;&#34;
 412     with pkg_resources.open_text(language_model_files, name) as f:
 413     # with open(p name), &#39;r&#39;) as f:
 414         for line in f:
 415             splits = line.split(sep)
 416             yield [splits[0], int(splits[1])]</code></pre>
 417 </details>
 418 </dd>
 419 <dt id="szyfrow.support.language_models.log_probability_of_unknown_word"><code class="name flex">
 420 <span>def <span class="ident">log_probability_of_unknown_word</span></span>(<span>key, N)</span>
 421 </code></dt>
 422 <dd>
 423 <div class="desc"><p>Estimate the probability of an unknown word.</p></div>
 424 <details class="source">
 425 <summary>
 426 <span>Expand source code</span>
 427 </summary>
 428 <pre><code class="python">def log_probability_of_unknown_word(key, N):
 429     &#34;&#34;&#34;Estimate the probability of an unknown word.
 430     &#34;&#34;&#34;
 431     return -log10(N * 10**((len(key) - 2) * 1.4))</code></pre>
 432 </details>
 433 </dd>
 434 <dt id="szyfrow.support.language_models.ngrams"><code class="name flex">
 435 <span>def <span class="ident">ngrams</span></span>(<span>text, n)</span>
 436 </code></dt>
 437 <dd>
 438 <div class="desc"><p>Returns all n-grams of a text</p>
 439 <pre><code class="language-python-repl">&gt;&gt;&gt; ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
 440 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
 441  'nf', 'fo', 'ox']
 442 &gt;&gt;&gt; ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
 443 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
 444  'rown', 'ownf', 'wnfo', 'nfox']
 445 </code></pre></div>
 446 <details class="source">
 447 <summary>
 448 <span>Expand source code</span>
 449 </summary>
 450 <pre><code class="python">def ngrams(text, n):
 451     &#34;&#34;&#34;Returns all n-grams of a text
 452
 453     &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 2) # doctest: +NORMALIZE_WHITESPACE
 454     [&#39;th&#39;, &#39;he&#39;, &#39;eq&#39;, &#39;qu&#39;, &#39;ui&#39;, &#39;ic&#39;, &#39;ck&#39;, &#39;kb&#39;, &#39;br&#39;, &#39;ro&#39;, &#39;ow&#39;, &#39;wn&#39;,
 455      &#39;nf&#39;, &#39;fo&#39;, &#39;ox&#39;]
 456     &gt;&gt;&gt; ngrams(sanitise(&#39;the quick brown fox&#39;), 4) # doctest: +NORMALIZE_WHITESPACE
 457     [&#39;theq&#39;, &#39;hequ&#39;, &#39;equi&#39;, &#39;quic&#39;, &#39;uick&#39;, &#39;ickb&#39;, &#39;ckbr&#39;, &#39;kbro&#39;, &#39;brow&#39;,
 458      &#39;rown&#39;, &#39;ownf&#39;, &#39;wnfo&#39;, &#39;nfox&#39;]
 459     &#34;&#34;&#34;
 460     return [text[i:i+n] for i in range(len(text)-n+1)]</code></pre>
 461 </details>
 462 </dd>
 463 <dt id="szyfrow.support.language_models.random_english_letter"><code class="name flex">
 464 <span>def <span class="ident">random_english_letter</span></span>(<span>)</span>
 465 </code></dt>
 466 <dd>
 467 <div class="desc"><p>Generate a random letter based on English letter counts</p></div>
 468 <details class="source">
 469 <summary>
 470 <span>Expand source code</span>
 471 </summary>
 472 <pre><code class="python">def random_english_letter():
 473     &#34;&#34;&#34;Generate a random letter based on English letter counts
 474     &#34;&#34;&#34;
 475     return weighted_choice(normalised_english_counts)</code></pre>
 476 </details>
 477 </dd>
 478 <dt id="szyfrow.support.language_models.transpositions_of"><code class="name flex">
 479 <span>def <span class="ident">transpositions_of</span></span>(<span>keyword)</span>
 480 </code></dt>
 481 <dd>
 482 <div class="desc"><p>Finds the transpositions given by a keyword. For instance, the keyword
 483 'clever' rearranges to 'celrv', so the first column (0) stays first, the
 484 second column (1) moves to third, the third column (2) moves to second,
 485 and so on.</p>
 486 <p>If passed a tuple, assume it's already a transposition and just return it.</p>
 487 <pre><code class="language-python-repl">&gt;&gt;&gt; transpositions_of('clever')
 488 (0, 2, 1, 4, 3)
 489 &gt;&gt;&gt; transpositions_of('fred')
 490 (3, 2, 0, 1)
 491 &gt;&gt;&gt; transpositions_of((3, 2, 0, 1))
 492 (3, 2, 0, 1)
 493 </code></pre></div>
 494 <details class="source">
 495 <summary>
 496 <span>Expand source code</span>
 497 </summary>
 498 <pre><code class="python">def transpositions_of(keyword):
 499     &#34;&#34;&#34;Finds the transpostions given by a keyword. For instance, the keyword
 500     &#39;clever&#39; rearranges to &#39;celrv&#39;, so the first column (0) stays first, the
 501     second column (1) moves to third, the third column (2) moves to second,
 502     and so on.
 503
 504     If passed a tuple, assume it&#39;s already a transposition and just return it.
 505
 506     &gt;&gt;&gt; transpositions_of(&#39;clever&#39;)
 507     (0, 2, 1, 4, 3)
 508     &gt;&gt;&gt; transpositions_of(&#39;fred&#39;)
 509     (3, 2, 0, 1)
 510     &gt;&gt;&gt; transpositions_of((3, 2, 0, 1))
 511     (3, 2, 0, 1)
 512     &#34;&#34;&#34;
 513     if isinstance(keyword, tuple):
 514         return keyword
 515     else:
 516         key = deduplicate(keyword)
 517         transpositions = tuple(key.index(l) for l in sorted(key))
 518         return transpositions</code></pre>
 519 </details>
 520 </dd>
 521 <dt id="szyfrow.support.language_models.weighted_choice"><code class="name flex">
 522 <span>def <span class="ident">weighted_choice</span></span>(<span>d)</span>
 523 </code></dt>
 524 <dd>
 525 <div class="desc"><p>Generate random item from a dictionary of item counts</p></div>
 526 <details class="source">
 527 <summary>
 528 <span>Expand source code</span>
 529 </summary>
 530 <pre><code class="python">def weighted_choice(d):
 531     &#34;&#34;&#34;Generate random item from a dictionary of item counts
 532     &#34;&#34;&#34;
 533     delems, dweights = list(zip(*d.items()))
 534     return random.choices(delems, dweights)[0]
 535     # target = random.uniform(0, sum(d.values()))
 536     # cuml = 0.0
 537     # for (l, p) in d.items():
 538     #     cuml += p
 539     #     if cuml &gt; target:
 540     #         return l
 541     # return None</code></pre>
 542 </details>
 543 </dd>
 544 </dl>
 545 </section>
 546 <section>
 547 <h2 class="section-title" id="header-classes">Classes</h2>
 548 <dl>
 549 <dt id="szyfrow.support.language_models.Pdist"><code class="flex name class">
 550 <span>class <span class="ident">Pdist</span></span>
 551 <span>(</span><span>data=[], estimate_of_missing=None)</span>
 552 </code></dt>
 553 <dd>
 554 <div class="desc"><p>A probability distribution estimated from <code>(key, count)</code> pairs.
 555 Values are base-10 log probabilities; a missing key yields <code>estimate_of_missing(key, total)</code>, by default <code>log10(1 / total)</code>.</p></div>
 556 <details class="source">
 557 <summary>
 558 <span>Expand source code</span>
 559 </summary>
 560 <pre><code class="python">class Pdist(dict):
 561     &#34;&#34;&#34;A probability distribution estimated from `(key, count)` pairs.
 562     Values are base-10 log probabilities; a missing key yields
 563     `estimate_of_missing(key, total)`, by default `log10(1 / total)`.&#34;&#34;&#34;
 564     def __init__(self, data=[], estimate_of_missing=None):
 565         data = list(data)  # read twice below, so materialise one-shot iterators
 566         self.total = sum(count for _key, count in data)
 567         for key, count in data:
 568             self[key] = log10(count / self.total)
 569         self.estimate_of_missing = estimate_of_missing or (lambda k, N: log10(1 / N))  # default stays in log space
 570     def __missing__(self, key):
 571         return self.estimate_of_missing(key, self.total)</code></pre>
 572 </details>
 573 <h3>Ancestors</h3>
 574 <ul class="hlist">
 575 <li>builtins.dict</li>
 576 </ul>
 577 </dd>
 578 </dl>
 579 </section>
 580 </article>
 581 <nav id="sidebar">
 582 <h1>Index</h1>
 583 <div class="toc">
 584 <ul></ul>
 585 </div>
 586 <ul id="index">
 587 <li><h3>Super-module</h3>
 588 <ul>
 589 <li><code><a title="szyfrow.support" href="index.html">szyfrow.support</a></code></li>
 590 </ul>
 591 </li>
 592 <li><h3><a href="#header-variables">Global variables</a></h3>
 593 <ul class="">
 594 <li><code><a title="szyfrow.support.language_models.P2l" href="#szyfrow.support.language_models.P2l">P2l</a></code></li>
 595 <li><code><a title="szyfrow.support.language_models.P3l" href="#szyfrow.support.language_models.P3l">P3l</a></code></li>
 596 <li><code><a title="szyfrow.support.language_models.Pl" href="#szyfrow.support.language_models.Pl">Pl</a></code></li>
 597 <li><code><a title="szyfrow.support.language_models.Pw" href="#szyfrow.support.language_models.Pw">Pw</a></code></li>
 598 <li><code><a title="szyfrow.support.language_models.english_bigram_counts" href="#szyfrow.support.language_models.english_bigram_counts">english_bigram_counts</a></code></li>
 599 <li><code><a title="szyfrow.support.language_models.english_counts" href="#szyfrow.support.language_models.english_counts">english_counts</a></code></li>
 600 <li><code><a title="szyfrow.support.language_models.english_trigram_counts" href="#szyfrow.support.language_models.english_trigram_counts">english_trigram_counts</a></code></li>
 601 <li><code><a title="szyfrow.support.language_models.keywords" href="#szyfrow.support.language_models.keywords">keywords</a></code></li>
 602 <li><code><a title="szyfrow.support.language_models.normalised_english_bigram_counts" href="#szyfrow.support.language_models.normalised_english_bigram_counts">normalised_english_bigram_counts</a></code></li>
 603 <li><code><a title="szyfrow.support.language_models.normalised_english_counts" href="#szyfrow.support.language_models.normalised_english_counts">normalised_english_counts</a></code></li>
 604 <li><code><a title="szyfrow.support.language_models.normalised_english_trigram_counts" href="#szyfrow.support.language_models.normalised_english_trigram_counts">normalised_english_trigram_counts</a></code></li>
 605 <li><code><a title="szyfrow.support.language_models.transpositions" href="#szyfrow.support.language_models.transpositions">transpositions</a></code></li>
 606 </ul>
 607 </li>
 608 <li><h3><a href="#header-functions">Functions</a></h3>
 609 <ul class="">
 610 <li><code><a title="szyfrow.support.language_models.Pbigrams" href="#szyfrow.support.language_models.Pbigrams">Pbigrams</a></code></li>
 611 <li><code><a title="szyfrow.support.language_models.Pletters" href="#szyfrow.support.language_models.Pletters">Pletters</a></code></li>
 612 <li><code><a title="szyfrow.support.language_models.Ptrigrams" href="#szyfrow.support.language_models.Ptrigrams">Ptrigrams</a></code></li>
 613 <li><code><a title="szyfrow.support.language_models.Pwords" href="#szyfrow.support.language_models.Pwords">Pwords</a></code></li>
 614 <li><code><a title="szyfrow.support.language_models.cosine_distance_score" href="#szyfrow.support.language_models.cosine_distance_score">cosine_distance_score</a></code></li>
 615 <li><code><a title="szyfrow.support.language_models.datafile" href="#szyfrow.support.language_models.datafile">datafile</a></code></li>
 616 <li><code><a title="szyfrow.support.language_models.log_probability_of_unknown_word" href="#szyfrow.support.language_models.log_probability_of_unknown_word">log_probability_of_unknown_word</a></code></li>
 617 <li><code><a title="szyfrow.support.language_models.ngrams" href="#szyfrow.support.language_models.ngrams">ngrams</a></code></li>
 618 <li><code><a title="szyfrow.support.language_models.random_english_letter" href="#szyfrow.support.language_models.random_english_letter">random_english_letter</a></code></li>
 619 <li><code><a title="szyfrow.support.language_models.transpositions_of" href="#szyfrow.support.language_models.transpositions_of">transpositions_of</a></code></li>
 620 <li><code><a title="szyfrow.support.language_models.weighted_choice" href="#szyfrow.support.language_models.weighted_choice">weighted_choice</a></code></li>
 621 </ul>
 622 </li>
 623 <li><h3><a href="#header-classes">Classes</a></h3>
 624 <ul>
 625 <li>
 626 <h4><code><a title="szyfrow.support.language_models.Pdist" href="#szyfrow.support.language_models.Pdist">Pdist</a></code></h4>
 627 </li>
 628 </ul>
 629 </li>
 630 </ul>
 631 </nav>
 632 </main>
 633 <footer id="footer">
 634 <p>Generated by <a href="https://pdoc3.github.io/pdoc"><cite>pdoc</cite> 0.9.2</a>.</p>
 635 </footer>
 636 </body>
 637 </html>