1 """Descriptive models of a natural language (in this case, English).
3 The functions `Pwords`, `Pletters`, `Pbigrams`, and `Ptrigrams` return the
4 log probability of a section of text.
6 If you want to use a different language, replace the data files in
7 [`szyfrow/language_model_files`](../language_model_files/index.html).
9 * `count_1l.txt`: counts of single letters
10 * `count_2l.txt`: counts of pairs of letters, bigrams
11 * `count_3l.txt`: counts of triples of letters, trigrams
12 * `words.txt`: a dictionary of words, used for keyword-based cipher breaking.
13 These words should only contain characters contained in
14 `string.ascii_letters`.
22 from math
import log10
24 import importlib
.resources
as pkg_resources
26 import szyfrow
.support
.norms
27 from szyfrow
.support
.utilities
import sanitise
, deduplicate
28 from szyfrow
import language_model_files
def datafile(name, sep='\t'):
    """Read key,value pairs from a packaged data file.

    Each line of the file is expected to be `key<sep>count`. Blank lines
    are skipped rather than raising on `int('')`.

    Parameters:
        name: file name inside `szyfrow.language_model_files`.
        sep: field separator (default: tab).

    Yields:
        [key, count] pairs, with count converted to int.
    """
    with pkg_resources.open_text(language_model_files, name) as f:
        for line in f:
            # ignore blank (e.g. trailing) lines in the data file
            if not line.strip():
                continue
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
# Letter-frequency models, built once at import time from the packaged
# data files (see `datafile` above).

english_counts = collections.Counter(dict(datafile('count_1l.txt')))
"""Counts of single letters in English."""

normalised_english_counts = szyfrow.support.norms.normalise(english_counts)
"""Normalised counts of single letters in English (the sum of all counts
is 1)."""

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
"""Counts of letter bigrams in English."""

normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)
"""Normalised counts of letter bigrams in English (the sum of all counts
is 1)."""

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
"""Counts of letter trigrams in English."""

normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)
"""Normalised counts of letter trigrams in English (the sum of all counts
is 1)."""

"""A sample list of keywords, to act as a dictionary for
dictionary-based cipher breaking attempts."""
# NOTE(review): the bare string above appears before the assignment it
# describes; doc tools attach such strings to the *preceding* assignment —
# confirm intended placement.
with pkg_resources.open_text(language_model_files, 'words.txt') as f:
    keywords = [line.rstrip() for line in f]
def transpositions_of(keyword):
    """Finds the transpositions given by a keyword. For instance, the keyword
    'clever' rearranges to 'celrv', so the first column (0) stays first, the
    second column (1) moves to third, the third column (2) moves to second,
    and so on.

    If passed a tuple, assume it's already a transposition and just return it.

    >>> transpositions_of('clever')
    (0, 2, 1, 4, 3)
    >>> transpositions_of('fred')
    (3, 2, 0, 1)
    >>> transpositions_of((3, 2, 0, 1))
    (3, 2, 0, 1)
    """
    if isinstance(keyword, tuple):
        # Already a transposition: pass it through unchanged.
        return keyword
    key = deduplicate(keyword)
    # For each letter of the alphabetically-sorted key, find where it sits
    # in the original key: that is the column order of the transposition.
    return tuple(key.index(letter) for letter in sorted(key))
transpositions = collections.defaultdict(list)
"""A sample dict of transpositions, to act as a dictionary for
dictionary-based cipher breaking attempts. Each key is a transposition,
each value is a list of words that give that transposition."""

# Group the sample keywords by the transposition each one generates.
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
def weighted_choice(d):
    """Generate a random item from a dictionary of item counts,
    with each item weighted by its count.

    Parameters:
        d: a dict mapping items to (non-negative) numeric weights.

    Returns:
        One key of `d`, chosen at random in proportion to its weight.
    """
    items, weights = zip(*d.items())
    return random.choices(items, weights=weights)[0]
def random_english_letter():
    """Return one letter drawn at random, weighted by the frequency of
    letters in English text.
    """
    letter_distribution = normalised_english_counts
    return weighted_choice(letter_distribution)
def ngrams(text, n):
    """Return every contiguous n-gram (length-n slice) of a text.

    >>> ngrams('thequickbrownfox', 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams('thequickbrownfox', 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    # The last valid starting index leaves exactly n items after it.
    last_start = len(text) - n
    return [text[start:start + n] for start in range(last_start + 1)]
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log (base 10) probabilities.

    Parameters:
        data: iterable of (key, count) pairs.
        estimate_of_missing: callable (key, total) -> log probability,
            used for keys not present in the data. Defaults to
            ``lambda k, N: 1./N``.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        # NOTE: default changed from [] to () — a mutable default argument
        # is an anti-pattern; () is behaviourally identical here.
        # tee the iterable: one pass to total the counts, one to store them.
        data1, data2 = itertools.tee(data)
        self.total = sum(d[1] for d in data1)
        for key, count in data2:
            self[key] = log10(count / self.total)
        # Used by __missing__ for keys not seen in the data.
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1. / N)

    def __missing__(self, key):
        """Estimate the (log) probability of an unseen key."""
        return self.estimate_of_missing(key, self.total)
def log_probability_of_unknown_word(key, N):
    """Estimate the log probability of an unknown word: longer unknown
    words are considered increasingly unlikely.
    """
    # Penalty grows exponentially with word length beyond two letters.
    length_penalty = 10 ** ((len(key) - 2) * 1.4)
    return -log10(N * length_penalty)
# Log-probability distributions, built at import time from the packaged
# data files.

Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of words. Unknown words have their probability estimated by
[log_probability_of_unknown_word](#szyfrow.support.language_models.log_probability_of_unknown_word)"""

Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of single letters. Unknown words have their probability estimated as zero."""

P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of letter bigrams. Unknown words have their probability estimated as zero."""

P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of letter trigrams. Unknown words have their probability estimated as zero."""
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.
    """
    total = 0
    for word in words:
        total += Pw[word.lower()]
    return total
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.
    """
    scores = [Pl[letter.lower()] for letter in letters]
    return sum(scores)
def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a sequence
    of letters.
    """
    bigram_scores = [P2l[bigram] for bigram in ngrams(letters, 2)]
    return sum(bigram_scores)
def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a sequence
    of letters.
    """
    trigram_scores = [P3l[trigram] for trigram in ngrams(letters, 3)]
    return sum(trigram_scores)
def cosine_distance_score(text):
    """Finds the dissimilarity of a text to English, using the cosine distance
    of the frequency distribution.

    The score is 1 minus the cosine similarity between the text's letter
    counts (after sanitising) and the English single-letter counts, so a
    text whose letter distribution matches English scores near 0.
    """
    letter_counts = collections.Counter(sanitise(text))
    return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
        letter_counts)
195 if __name__
== "__main__":