5 <meta name=
"viewport" content=
"width=device-width, initial-scale=1, minimum-scale=1" />
6 <meta name=
"generator" content=
"pdoc 0.9.2" />
7 <title>szyfrow.support.language_models API documentation
</title>
8 <meta name=
"description" content=
"Descriptive models of a natural language (in this case, English) …" />
9 <link rel=
"preload stylesheet" as=
"style" href=
"https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/sanitize.min.css" integrity=
"sha256-PK9q560IAAa6WVRRh76LtCaI8pjTJ2z11v0miyNNjrs=" crossorigin
>
10 <link rel=
"preload stylesheet" as=
"style" href=
"https://cdnjs.cloudflare.com/ajax/libs/10up-sanitize.css/11.0.1/typography.min.css" integrity=
"sha256-7l/o7C8jubJiy74VsKTidCy1yBkRtiUGbVkYBylBqUg=" crossorigin
>
11 <link rel=
"stylesheet preload" as=
"style" href=
"https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/styles/github.min.css" crossorigin
>
12 <style>:root{--highlight-color:#fe9}.flex{display:flex !important}body{line-height:
1.5em}#content{padding:
20px}#sidebar{padding:
30px;overflow:hidden}#sidebar
> *:last-child{margin-bottom:
2cm}.http-server-breadcrumbs{font-size:
130%;margin:
0 0 15px
0}#footer{font-size:
.75em;padding:
5px
30px;border-top:
1px solid #ddd;text-align:right}#footer p{margin:
0 0 0 1em;display:inline-block}#footer p:last-child{margin-right:
30px}h1,h2,h3,h4,h5{font-weight:
300}h1{font-size:
2.5em;line-height:
1.1em}h2{font-size:
1.75em;margin:
1em
0 .50em
0}h3{font-size:
1.4em;margin:
25px
0 10px
0}h4{margin:
0;font-size:
105%}h1:target,h2:target,h3:target,h4:target,h5:target,h6:target{background:var(--highlight-color);padding:
.2em
0}a{color:#058;text-decoration:none;transition:color
.3s ease-in-out}a:hover{color:#e82}.title code{font-weight:bold}h2[id^=
"header-"]{margin-top:
2em}.ident{color:#900}pre code{background:#f8f8f8;font-size:
.8em;line-height:
1.4em}code{background:#f2f2f1;padding:
1px
4px;overflow-wrap:break-word}h1 code{background:transparent}pre{background:#f8f8f8;border:
0;border-top:
1px solid #ccc;border-bottom:
1px solid #ccc;margin:
1em
0;padding:
1ex}#http-server-module-list{display:flex;flex-flow:column}#http-server-module-list div{display:flex}#http-server-module-list dt{min-width:
10%}#http-server-module-list p{margin-top:
0}.toc ul,#index{list-style-type:none;margin:
0;padding:
0}#index code{background:transparent}#index h3{border-bottom:
1px solid #ddd}#index ul{padding:
0}#index h4{margin-top:
.6em;font-weight:bold}@media (min-width:
200ex){#index .two-column{column-count:
2}}@media (min-width:
300ex){#index .two-column{column-count:
3}}dl{margin-bottom:
2em}dl dl:last-child{margin-bottom:
4em}dd{margin:
0 0 1em
3em}#header-classes + dl
> dd{margin-bottom:
3em}dd dd{margin-left:
2em}dd p{margin:
10px
0}.name{background:#eee;font-weight:bold;font-size:
.85em;padding:
5px
10px;display:inline-block;min-width:
40%}.name:hover{background:#e0e0e0}dt:target .name{background:var(--highlight-color)}.name
> span:first-child{white-space:nowrap}.name.class
> span:nth-child(
2){margin-left:
.4em}.inherited{color:#999;border-left:
5px solid #eee;padding-left:
1em}.inheritance em{font-style:normal;font-weight:bold}.desc h2{font-weight:
400;font-size:
1.25em}.desc h3{font-size:
1em}.desc dt code{background:inherit}.source summary,.git-link-div{color:#666;text-align:right;font-weight:
400;font-size:
.8em;text-transform:uppercase}.source summary
> *{white-space:nowrap;cursor:pointer}.git-link{color:inherit;margin-left:
1em}.source pre{max-height:
500px;overflow:auto;margin:
0}.source pre code{font-size:
12px;overflow:visible}.hlist{list-style:none}.hlist li{display:inline}.hlist li:after{content:',\2002'}.hlist li:last-child:after{content:none}.hlist .hlist{display:inline;padding-left:
1em}img{max-width:
100%}td{padding:
0 .5em}.admonition{padding:
.1em
.5em;margin-bottom:
1em}.admonition-title{font-weight:bold}.admonition.note,.admonition.info,.admonition.important{background:#aef}.admonition.todo,.admonition.versionadded,.admonition.tip,.admonition.hint{background:#dfd}.admonition.warning,.admonition.versionchanged,.admonition.deprecated{background:#fd4}.admonition.error,.admonition.danger,.admonition.caution{background:lightpink}
</style>
13 <style media=
"screen and (min-width: 700px)">@media screen and (min-width:
700px){#sidebar{width:
30%;height:
100vh;overflow:auto;position:sticky;top:
0}#content{width:
70%;max-width:
100ch;padding:
3em
4em;border-left:
1px solid #ddd}pre code{font-size:
1em}.item .name{font-size:
1em}main{display:flex;flex-direction:row-reverse;justify-content:flex-end}.toc ul ul,#index ul{padding-left:
1.5em}.toc
> ul
> li{margin-top:
.5em}}
</style>
14 <style media=
"print">@media print{#sidebar h1{page-break-before:always}.source{display:none}}@media print{*{background:transparent !important;color:#000 !important;box-shadow:none !important;text-shadow:none !important}a[href]:after{content:
" (" attr(href)
")";font-size:
90%}a[href][title]:after{content:none}abbr[title]:after{content:
" (" attr(title)
")"}.ir a:after,a[href^=
"javascript:"]:after,a[href^=
"#"]:after{content:
""}pre,blockquote{border:
1px solid #999;page-break-inside:avoid}thead{display:table-header-group}tr,img{page-break-inside:avoid}img{max-width:
100% !important}@page{margin:
0.5cm}p,h2,h3{orphans:
3;widows:
3}h1,h2,h3,h4,h5,h6{page-break-after:avoid}}
</style>
15 <script defer
src=
"https://cdnjs.cloudflare.com/ajax/libs/highlight.js/10.1.1/highlight.min.js" integrity=
"sha256-Uv3H6lx7dJmRfRvH8TH6kJD1TSK1aFcwgx+mdg3epi8=" crossorigin
></script>
16 <script>window.addEventListener('DOMContentLoaded', () => hljs.initHighlighting())
</script>
20 <article id=
"content">
22 <h1 class=
"title">Module
<code>szyfrow.support.language_models
</code></h1>
24 <section id=
"section-intro">
25 <p>Descriptive models of a natural language (in this case, English).
</p>
26 <p>The functions
<code><a title=
"szyfrow.support.language_models.Pwords" href=
"#szyfrow.support.language_models.Pwords">Pwords()
</a></code>,
<code><a title=
"szyfrow.support.language_models.Pletters" href=
"#szyfrow.support.language_models.Pletters">Pletters()
</a></code>,
<code><a title=
"szyfrow.support.language_models.Pbigrams" href=
"#szyfrow.support.language_models.Pbigrams">Pbigrams()
</a></code>, and
<code><a title=
"szyfrow.support.language_models.Ptrigrams" href=
"#szyfrow.support.language_models.Ptrigrams">Ptrigrams()
</a></code> return the
27 log probability of a section of text.
</p>
28 <p>If you want to use a different language, replace the data files in
29 <a href=
"../language_model_files/index.html"><code>szyfrow/language_model_files
</code></a>.
</p>
31 <li><code>count_1l.txt
</code>: counts of single letters
</li>
32 <li><code>count_2l.txt
</code>: counts of pairs of letters, bigrams
</li>
33 <li><code>count_3l.txt
</code>: counts of triples of letters, trigrams
</li>
34 <li><code>words.txt
</code>: a dictionary of words, used for keyword-based cipher breaking.
35 These words should only contain characters contained in
36 <code>string.ascii_letters
</code>.
</li>
38 <details class=
"source">
40 <span>Expand source code
</span>
42 <pre><code class=
"python">"""Descriptive models of a natural language (in this case, English).
44 The functions `Pwords`, `Pletters`, `Pbigrams`, and `Ptrigrams` return the
45 log probability of a section of text.
47 If you want to use a different language, replace the data files in
48 [`szyfrow/language_model_files`](../language_model_files/index.html).
50 * `count_1l.txt`: counts of single letters
51 * `count_2l.txt`: counts of pairs letters, bigrams
52 * `count_3l.txt`: counts of triples of letters, triagrams
53 * `words.txt`: a dictionary of words, used for keyword-based cipher breaking.
54 These words should only contain characters cointained in
55 `string.ascii_letters`.
63 from math import log10
65 import importlib.resources as pkg_resources
67 import szyfrow.support.norms
68 from szyfrow.support.utilities import sanitise, deduplicate
69 from szyfrow import language_model_files
72 def datafile(name, sep=
'\t
'):
73 """Read key,value pairs from file.
75 with pkg_resources.open_text(language_model_files, name) as f:
76 # with open(p name),
'r
') as f:
78 splits = line.split(sep)
79 yield [splits[
0], int(splits[
1])]
81 english_counts = collections.Counter(dict(datafile(
'count_1l.txt
')))
82 """Counts of single letters in English.
"""
83 normalised_english_counts = szyfrow.support.norms.normalise(english_counts)
84 """Normalised counts of single letters in English (the sum of all counts
85 adds to
1).
"""
87 english_bigram_counts = collections.Counter(dict(datafile(
'count_2l.txt
')))
88 """Counts of letter bigrams in English.
"""
89 normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)
90 """Normalised counts of letter bigrams in English (the sum of all counts
91 adds to
1).
"""
93 english_trigram_counts = collections.Counter(dict(datafile(
'count_3l.txt
')))
94 """Counts of letter trigrams in English.
"""
95 normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)
96 """Normalised counts of letter trigrams in English (the sum of all counts
97 adds to
1).
"""
100 """A sample list of keywords, to act as a dictionary for
101 dictionary-based cipher breaking attempts.
"""
102 with pkg_resources.open_text(language_model_files,
'words.txt
') as f:
103 keywords = [line.rstrip() for line in f]
106 def transpositions_of(keyword):
107 """Finds the transpostions given by a keyword. For instance, the keyword
108 'clever
' rearranges to
'celrv
', so the first column (
0) stays first, the
109 second column (
1) moves to third, the third column (
2) moves to second,
112 If passed a tuple, assume it
's already a transposition and just return it.
114 >>> transpositions_of(
'clever
')
116 >>> transpositions_of(
'fred
')
118 >>> transpositions_of((
3,
2,
0,
1))
121 if isinstance(keyword, tuple):
124 key = deduplicate(keyword)
125 transpositions = tuple(key.index(l) for l in sorted(key))
126 return transpositions
128 transpositions = collections.defaultdict(list)
129 """A sample dict of transpositions, to act as a dictionary for
130 dictionary-based cipher breaking attempts. Each key is a transposition,
131 each value is a list of words that give that transposition.
"""
132 for word in keywords:
133 transpositions[transpositions_of(word)] += [word]
136 def weighted_choice(d):
137 """Generate random item from a dictionary of item counts
139 delems, dweights = list(zip(*d.items()))
140 return random.choices(delems, dweights)[
0]
141 # target = random.uniform(
0, sum(d.values()))
143 # for (l, p) in d.items():
145 # if cuml
> target:
149 def random_english_letter():
150 """Generate a random letter based on English letter counts
152 return weighted_choice(normalised_english_counts)
156 """Returns all n-grams of a text
158 >>> ngrams(sanitise(
'the quick brown fox
'),
2) # doctest: +NORMALIZE_WHITESPACE
159 [
'th
',
'he
',
'eq
',
'qu
',
'ui
',
'ic
',
'ck
',
'kb
',
'br
',
'ro
',
'ow
',
'wn
',
160 'nf
',
'fo
',
'ox
']
161 >>> ngrams(sanitise(
'the quick brown fox
'),
4) # doctest: +NORMALIZE_WHITESPACE
162 [
'theq
',
'hequ
',
'equi
',
'quic
',
'uick
',
'ickb
',
'ckbr
',
'kbro
',
'brow
',
163 'rown
',
'ownf
',
'wnfo
',
'nfox
']
165 return [text[i:i+n] for i in range(len(text)-n+
1)]
169 """A probability distribution estimated from counts in datafile.
170 Values are stored and returned as log probabilities.
172 def __init__(self, data=[], estimate_of_missing=None):
173 data1, data2 = itertools.tee(data)
174 self.total = sum([d[
1] for d in data1])
175 for key, count in data2:
176 self[key] = log10(count / self.total)
177 self.estimate_of_missing = estimate_of_missing or (lambda k, N:
1./N)
178 def __missing__(self, key):
179 return self.estimate_of_missing(key, self.total)
181 def log_probability_of_unknown_word(key, N):
182 """Estimate the probability of an unknown word.
184 return -log10(N *
10**((len(key) -
2) *
1.4))
186 Pw = Pdist(datafile(
'count_1w.txt
'), log_probability_of_unknown_word)
187 """A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
188 of words. Unknown words have their probability estimated by
189 [log_probability_of_unknown_word](#szyfrow.support.language_models.log_probability_of_unknown_word)
"""
190 Pl = Pdist(datafile(
'count_1l.txt
'), lambda _k, _N:
0)
191 """A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
192 of single letters. Unknown words have their probability estimated as zero.
"""
193 P2l = Pdist(datafile(
'count_2l.txt
'), lambda _k, _N:
0)
194 """A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
195 of letter bigrams. Unknown words have their probability estimated as zero.
"""
196 P3l = Pdist(datafile(
'count_3l.txt
'), lambda _k, _N:
0)
197 """A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
198 of letter trigrams. Unknown words have their probability estimated as zero.
"""
201 """The Naive Bayes log probability of a sequence of words.
203 return sum(Pw[w.lower()] for w in words)
205 def Pletters(letters):
206 """The Naive Bayes log probability of a sequence of letters.
208 return sum(Pl[l.lower()] for l in letters)
210 def Pbigrams(letters):
211 """The Naive Bayes log probability of the bigrams formed from a sequence
214 return sum(P2l[p] for p in ngrams(letters,
2))
216 def Ptrigrams(letters):
217 """The Naive Bayes log probability of the trigrams formed from a sequence
220 return sum(P3l[p] for p in ngrams(letters,
3))
223 def cosine_distance_score(text):
224 """Finds the dissimilarity of a text to English, using the cosine distance
225 of the frequency distribution.
227 >>> cosine_distance_score(
'abcabc
') # doctest: +ELLIPSIS
230 # return szyfrow.support.norms.cosine_distance(english_counts,
231 # collections.Counter(sanitise(text)))
232 return
1 - szyfrow.support.norms.cosine_similarity(english_counts,
233 collections.Counter(sanitise(text)))
236 if __name__ ==
"__main__
":
238 doctest.testmod()
</code></pre>
244 <h2 class=
"section-title" id=
"header-variables">Global variables
</h2>
246 <dt id=
"szyfrow.support.language_models.P2l"><code class=
"name">var
<span class=
"ident">P2l
</span></code></dt>
248 <div class=
"desc"><p>A
<a href=
"#szyfrow.support.language_models.Pdist">Pdist
</a> holding log probabilities
249 of letter bigrams. Unknown bigrams have their log probability estimated as zero.
</p></div>
251 <dt id=
"szyfrow.support.language_models.P3l"><code class=
"name">var
<span class=
"ident">P3l
</span></code></dt>
253 <div class=
"desc"><p>A
<a href=
"#szyfrow.support.language_models.Pdist">Pdist
</a> holding log probabilities
254 of letter trigrams. Unknown trigrams have their log probability estimated as zero.
</p></div>
256 <dt id=
"szyfrow.support.language_models.Pl"><code class=
"name">var
<span class=
"ident">Pl
</span></code></dt>
258 <div class=
"desc"><p>A
<a href=
"#szyfrow.support.language_models.Pdist">Pdist
</a> holding log probabilities
259 of single letters. Unknown letters have their log probability estimated as zero.
</p></div>
261 <dt id=
"szyfrow.support.language_models.Pw"><code class=
"name">var
<span class=
"ident">Pw
</span></code></dt>
263 <div class=
"desc"><p>A
<a href=
"#szyfrow.support.language_models.Pdist">Pdist
</a> holding log probabilities
264 of words. Unknown words have their probability estimated by
265 <a href=
"#szyfrow.support.language_models.log_probability_of_unknown_word">log_probability_of_unknown_word
</a></p></div>
267 <dt id=
"szyfrow.support.language_models.english_bigram_counts"><code class=
"name">var
<span class=
"ident">english_bigram_counts
</span></code></dt>
269 <div class=
"desc"><p>Counts of letter bigrams in English.
</p></div>
271 <dt id=
"szyfrow.support.language_models.english_counts"><code class=
"name">var
<span class=
"ident">english_counts
</span></code></dt>
273 <div class=
"desc"><p>Counts of single letters in English.
</p></div>
275 <dt id=
"szyfrow.support.language_models.english_trigram_counts"><code class=
"name">var
<span class=
"ident">english_trigram_counts
</span></code></dt>
277 <div class=
"desc"><p>Counts of letter trigrams in English.
</p></div>
279 <dt id=
"szyfrow.support.language_models.keywords"><code class=
"name">var
<span class=
"ident">keywords
</span></code></dt>
281 <div class=
"desc"><p>A sample list of keywords, to act as a dictionary for
282 dictionary-based cipher breaking attempts.
</p></div>
284 <dt id=
"szyfrow.support.language_models.normalised_english_bigram_counts"><code class=
"name">var
<span class=
"ident">normalised_english_bigram_counts
</span></code></dt>
286 <div class=
"desc"><p>Normalised counts of letter bigrams in English (the sum of all counts
287 adds to
1).
</p></div>
289 <dt id=
"szyfrow.support.language_models.normalised_english_counts"><code class=
"name">var
<span class=
"ident">normalised_english_counts
</span></code></dt>
291 <div class=
"desc"><p>Normalised counts of single letters in English (the sum of all counts
292 adds to
1).
</p></div>
294 <dt id=
"szyfrow.support.language_models.normalised_english_trigram_counts"><code class=
"name">var
<span class=
"ident">normalised_english_trigram_counts
</span></code></dt>
296 <div class=
"desc"><p>Normalised counts of letter trigrams in English (the sum of all counts
297 adds to
1).
</p></div>
299 <dt id=
"szyfrow.support.language_models.transpositions"><code class=
"name">var
<span class=
"ident">transpositions
</span></code></dt>
301 <div class=
"desc"><p>A sample dict of transpositions, to act as a dictionary for
302 dictionary-based cipher breaking attempts. Each key is a transposition,
303 each value is a list of words that give that transposition.
</p></div>
308 <h2 class=
"section-title" id=
"header-functions">Functions
</h2>
310 <dt id=
"szyfrow.support.language_models.Pbigrams"><code class=
"name flex">
311 <span>def
<span class=
"ident">Pbigrams
</span></span>(
<span>letters)
</span>
314 <div class=
"desc"><p>The Naive Bayes log probability of the bigrams formed from a sequence
315 of letters.
</p></div>
316 <details class=
"source">
318 <span>Expand source code
</span>
320 <pre><code class=
"python">def Pbigrams(letters):
321 """The Naive Bayes log probability of the bigrams formed from a sequence
324 return sum(P2l[p] for p in ngrams(letters,
2))
</code></pre>
327 <dt id=
"szyfrow.support.language_models.Pletters"><code class=
"name flex">
328 <span>def
<span class=
"ident">Pletters
</span></span>(
<span>letters)
</span>
331 <div class=
"desc"><p>The Naive Bayes log probability of a sequence of letters.
</p></div>
332 <details class=
"source">
334 <span>Expand source code
</span>
336 <pre><code class=
"python">def Pletters(letters):
337 """The Naive Bayes log probability of a sequence of letters.
339 return sum(Pl[l.lower()] for l in letters)
</code></pre>
342 <dt id=
"szyfrow.support.language_models.Ptrigrams"><code class=
"name flex">
343 <span>def
<span class=
"ident">Ptrigrams
</span></span>(
<span>letters)
</span>
346 <div class=
"desc"><p>The Naive Bayes log probability of the trigrams formed from a sequence
347 of letters.
</p></div>
348 <details class=
"source">
350 <span>Expand source code
</span>
352 <pre><code class=
"python">def Ptrigrams(letters):
353 """The Naive Bayes log probability of the trigrams formed from a sequence
356 return sum(P3l[p] for p in ngrams(letters,
3))
</code></pre>
359 <dt id=
"szyfrow.support.language_models.Pwords"><code class=
"name flex">
360 <span>def
<span class=
"ident">Pwords
</span></span>(
<span>words)
</span>
363 <div class=
"desc"><p>The Naive Bayes log probability of a sequence of words.
</p></div>
364 <details class=
"source">
366 <span>Expand source code
</span>
368 <pre><code class=
"python">def Pwords(words):
369 """The Naive Bayes log probability of a sequence of words.
371 return sum(Pw[w.lower()] for w in words)
</code></pre>
374 <dt id=
"szyfrow.support.language_models.cosine_distance_score"><code class=
"name flex">
375 <span>def
<span class=
"ident">cosine_distance_score
</span></span>(
<span>text)
</span>
378 <div class=
"desc"><p>Finds the dissimilarity of a text to English, using the cosine distance
379 of the frequency distribution.
</p>
380 <pre><code class=
"language-python-repl">>>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
383 <details class=
"source">
385 <span>Expand source code
</span>
387 <pre><code class=
"python">def cosine_distance_score(text):
388 """Finds the dissimilarity of a text to English, using the cosine distance
389 of the frequency distribution.
391 >>> cosine_distance_score(
'abcabc
') # doctest: +ELLIPSIS
394 # return szyfrow.support.norms.cosine_distance(english_counts,
395 # collections.Counter(sanitise(text)))
396 return
1 - szyfrow.support.norms.cosine_similarity(english_counts,
397 collections.Counter(sanitise(text)))
</code></pre>
400 <dt id=
"szyfrow.support.language_models.datafile"><code class=
"name flex">
401 <span>def
<span class=
"ident">datafile
</span></span>(
<span>name, sep='\t')
</span>
404 <div class=
"desc"><p>Read key,value pairs from file.
</p></div>
405 <details class=
"source">
407 <span>Expand source code
</span>
409 <pre><code class=
"python">def datafile(name, sep=
'\t
'):
410 """Read key,value pairs from file.
412 with pkg_resources.open_text(language_model_files, name) as f:
413 # with open(p name),
'r
') as f:
415 splits = line.split(sep)
416 yield [splits[
0], int(splits[
1])]
</code></pre>
419 <dt id=
"szyfrow.support.language_models.log_probability_of_unknown_word"><code class=
"name flex">
420 <span>def
<span class=
"ident">log_probability_of_unknown_word
</span></span>(
<span>key, N)
</span>
423 <div class=
"desc"><p>Estimate the log probability of an unknown word.
</p></div>
424 <details class=
"source">
426 <span>Expand source code
</span>
428 <pre><code class=
"python">def log_probability_of_unknown_word(key, N):
429 """Estimate the probability of an unknown word.
431 return -log10(N *
10**((len(key) -
2) *
1.4))
</code></pre>
434 <dt id=
"szyfrow.support.language_models.ngrams"><code class=
"name flex">
435 <span>def
<span class=
"ident">ngrams
</span></span>(
<span>text, n)
</span>
438 <div class=
"desc"><p>Returns all n-grams of a text
</p>
439 <pre><code class=
"language-python-repl">>>> ngrams(sanitise('the quick brown fox'),
2) # doctest: +NORMALIZE_WHITESPACE
440 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
442 >>> ngrams(sanitise('the quick brown fox'),
4) # doctest: +NORMALIZE_WHITESPACE
443 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
444 'rown', 'ownf', 'wnfo', 'nfox']
446 <details class=
"source">
448 <span>Expand source code
</span>
450 <pre><code class=
"python">def ngrams(text, n):
451 """Returns all n-grams of a text
453 >>> ngrams(sanitise(
'the quick brown fox
'),
2) # doctest: +NORMALIZE_WHITESPACE
454 [
'th
',
'he
',
'eq
',
'qu
',
'ui
',
'ic
',
'ck
',
'kb
',
'br
',
'ro
',
'ow
',
'wn
',
455 'nf
',
'fo
',
'ox
']
456 >>> ngrams(sanitise(
'the quick brown fox
'),
4) # doctest: +NORMALIZE_WHITESPACE
457 [
'theq
',
'hequ
',
'equi
',
'quic
',
'uick
',
'ickb
',
'ckbr
',
'kbro
',
'brow
',
458 'rown
',
'ownf
',
'wnfo
',
'nfox
']
460 return [text[i:i+n] for i in range(len(text)-n+
1)]
</code></pre>
463 <dt id=
"szyfrow.support.language_models.random_english_letter"><code class=
"name flex">
464 <span>def
<span class=
"ident">random_english_letter
</span></span>(
<span>)
</span>
467 <div class=
"desc"><p>Generate a random letter based on English letter counts
</p></div>
468 <details class=
"source">
470 <span>Expand source code
</span>
472 <pre><code class=
"python">def random_english_letter():
473 """Generate a random letter based on English letter counts
475 return weighted_choice(normalised_english_counts)
</code></pre>
478 <dt id=
"szyfrow.support.language_models.transpositions_of"><code class=
"name flex">
479 <span>def
<span class=
"ident">transpositions_of
</span></span>(
<span>keyword)
</span>
482 <div class=
"desc"><p>Finds the transpositions given by a keyword. For instance, the keyword
483 'clever' rearranges to 'celrv', so the first column (
0) stays first, the
484 second column (
1) moves to third, the third column (
2) moves to second,
486 <p>If passed a tuple, assume it's already a transposition and just return it.
</p>
487 <pre><code class=
"language-python-repl">>>> transpositions_of('clever')
489 >>> transpositions_of('fred')
491 >>> transpositions_of((
3,
2,
0,
1))
494 <details class=
"source">
496 <span>Expand source code
</span>
498 <pre><code class=
"python">def transpositions_of(keyword):
499 """Finds the transpostions given by a keyword. For instance, the keyword
500 'clever
' rearranges to
'celrv
', so the first column (
0) stays first, the
501 second column (
1) moves to third, the third column (
2) moves to second,
504 If passed a tuple, assume it
's already a transposition and just return it.
506 >>> transpositions_of(
'clever
')
508 >>> transpositions_of(
'fred
')
510 >>> transpositions_of((
3,
2,
0,
1))
513 if isinstance(keyword, tuple):
516 key = deduplicate(keyword)
517 transpositions = tuple(key.index(l) for l in sorted(key))
518 return transpositions
</code></pre>
521 <dt id=
"szyfrow.support.language_models.weighted_choice"><code class=
"name flex">
522 <span>def
<span class=
"ident">weighted_choice
</span></span>(
<span>d)
</span>
525 <div class=
"desc"><p>Generate random item from a dictionary of item counts
</p></div>
526 <details class=
"source">
528 <span>Expand source code
</span>
530 <pre><code class=
"python">def weighted_choice(d):
531 """Generate random item from a dictionary of item counts
533 delems, dweights = list(zip(*d.items()))
534 return random.choices(delems, dweights)[
0]
535 # target = random.uniform(
0, sum(d.values()))
537 # for (l, p) in d.items():
539 # if cuml
> target:
541 # return None
</code></pre>
547 <h2 class=
"section-title" id=
"header-classes">Classes
</h2>
549 <dt id=
"szyfrow.support.language_models.Pdist"><code class=
"flex name class">
550 <span>class
<span class=
"ident">Pdist
</span></span>
551 <span>(
</span><span>data=[], estimate_of_missing=None)
</span>
554 <div class=
"desc"><p>A probability distribution estimated from counts in datafile.
555 Values are stored and returned as log probabilities.
</p></div>
556 <details class=
"source">
558 <span>Expand source code
</span>
560 <pre><code class=
"python">class Pdist(dict):
561 """A probability distribution estimated from counts in datafile.
562 Values are stored and returned as log probabilities.
564 def __init__(self, data=[], estimate_of_missing=None):
565 data1, data2 = itertools.tee(data)
566 self.total = sum([d[
1] for d in data1])
567 for key, count in data2:
568 self[key] = log10(count / self.total)
569 self.estimate_of_missing = estimate_of_missing or (lambda k, N:
1./N)
570 def __missing__(self, key):
571 return self.estimate_of_missing(key, self.total)
</code></pre>
575 <li>builtins.dict
</li>
587 <li><h3>Super-module
</h3>
589 <li><code><a title=
"szyfrow.support" href=
"index.html">szyfrow.support
</a></code></li>
592 <li><h3><a href=
"#header-variables">Global variables
</a></h3>
594 <li><code><a title=
"szyfrow.support.language_models.P2l" href=
"#szyfrow.support.language_models.P2l">P2l
</a></code></li>
595 <li><code><a title=
"szyfrow.support.language_models.P3l" href=
"#szyfrow.support.language_models.P3l">P3l
</a></code></li>
596 <li><code><a title=
"szyfrow.support.language_models.Pl" href=
"#szyfrow.support.language_models.Pl">Pl
</a></code></li>
597 <li><code><a title=
"szyfrow.support.language_models.Pw" href=
"#szyfrow.support.language_models.Pw">Pw
</a></code></li>
598 <li><code><a title=
"szyfrow.support.language_models.english_bigram_counts" href=
"#szyfrow.support.language_models.english_bigram_counts">english_bigram_counts
</a></code></li>
599 <li><code><a title=
"szyfrow.support.language_models.english_counts" href=
"#szyfrow.support.language_models.english_counts">english_counts
</a></code></li>
600 <li><code><a title=
"szyfrow.support.language_models.english_trigram_counts" href=
"#szyfrow.support.language_models.english_trigram_counts">english_trigram_counts
</a></code></li>
601 <li><code><a title=
"szyfrow.support.language_models.keywords" href=
"#szyfrow.support.language_models.keywords">keywords
</a></code></li>
602 <li><code><a title=
"szyfrow.support.language_models.normalised_english_bigram_counts" href=
"#szyfrow.support.language_models.normalised_english_bigram_counts">normalised_english_bigram_counts
</a></code></li>
603 <li><code><a title=
"szyfrow.support.language_models.normalised_english_counts" href=
"#szyfrow.support.language_models.normalised_english_counts">normalised_english_counts
</a></code></li>
604 <li><code><a title=
"szyfrow.support.language_models.normalised_english_trigram_counts" href=
"#szyfrow.support.language_models.normalised_english_trigram_counts">normalised_english_trigram_counts
</a></code></li>
605 <li><code><a title=
"szyfrow.support.language_models.transpositions" href=
"#szyfrow.support.language_models.transpositions">transpositions
</a></code></li>
608 <li><h3><a href=
"#header-functions">Functions
</a></h3>
610 <li><code><a title=
"szyfrow.support.language_models.Pbigrams" href=
"#szyfrow.support.language_models.Pbigrams">Pbigrams
</a></code></li>
611 <li><code><a title=
"szyfrow.support.language_models.Pletters" href=
"#szyfrow.support.language_models.Pletters">Pletters
</a></code></li>
612 <li><code><a title=
"szyfrow.support.language_models.Ptrigrams" href=
"#szyfrow.support.language_models.Ptrigrams">Ptrigrams
</a></code></li>
613 <li><code><a title=
"szyfrow.support.language_models.Pwords" href=
"#szyfrow.support.language_models.Pwords">Pwords
</a></code></li>
614 <li><code><a title=
"szyfrow.support.language_models.cosine_distance_score" href=
"#szyfrow.support.language_models.cosine_distance_score">cosine_distance_score
</a></code></li>
615 <li><code><a title=
"szyfrow.support.language_models.datafile" href=
"#szyfrow.support.language_models.datafile">datafile
</a></code></li>
616 <li><code><a title=
"szyfrow.support.language_models.log_probability_of_unknown_word" href=
"#szyfrow.support.language_models.log_probability_of_unknown_word">log_probability_of_unknown_word
</a></code></li>
617 <li><code><a title=
"szyfrow.support.language_models.ngrams" href=
"#szyfrow.support.language_models.ngrams">ngrams
</a></code></li>
618 <li><code><a title=
"szyfrow.support.language_models.random_english_letter" href=
"#szyfrow.support.language_models.random_english_letter">random_english_letter
</a></code></li>
619 <li><code><a title=
"szyfrow.support.language_models.transpositions_of" href=
"#szyfrow.support.language_models.transpositions_of">transpositions_of
</a></code></li>
620 <li><code><a title=
"szyfrow.support.language_models.weighted_choice" href=
"#szyfrow.support.language_models.weighted_choice">weighted_choice
</a></code></li>
623 <li><h3><a href=
"#header-classes">Classes
</a></h3>
626 <h4><code><a title=
"szyfrow.support.language_models.Pdist" href=
"#szyfrow.support.language_models.Pdist">Pdist
</a></code></h4>
634 <p>Generated by
<a href=
"https://pdoc3.github.io/pdoc"><cite>pdoc
</cite> 0.9.2</a>.
</p>