Minor documentation updates
[szyfrow.git] / szyfrow / support / language_models.py
1 """Descriptive models of a natural language (in this case, English).
2
3 The functions `Pwords`, `Pletters`, `Pbigrams`, and `Ptrigrams` return the
4 log probability of a section of text.
5
6 If you want to use a different language, replace the data files in
7 [`szyfrow/language_model_files`](../language_model_files/index.html).
8
* `count_1l.txt`: counts of single letters
* `count_2l.txt`: counts of pairs of letters, bigrams
* `count_3l.txt`: counts of triples of letters, trigrams
* `words.txt`: a dictionary of words, used for keyword-based cipher breaking.
  These words should only contain characters contained in
  `string.ascii_letters`.
15
16 """
17
18 import string
19 import random
20 import collections
21 import itertools
22 from math import log10
23 import os
24 import importlib.resources as pkg_resources
25
26 import szyfrow.support.norms
27 from szyfrow.support.utilities import sanitise, deduplicate
28 from szyfrow import language_model_files
29
30
def datafile(name, sep='\t'):
    """Read key,value pairs from a data file bundled in
    `szyfrow.language_model_files`.

    Each line of the file is `key<sep>count`. Yields `[key, count]`
    pairs, with the count converted to `int`.

    Parameters:
    * `name`: filename within the `language_model_files` package.
    * `sep`: field separator between key and count (default: tab).
    """
    with pkg_resources.open_text(language_model_files, name) as f:
        for line in f:
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
# Letter n-gram frequency tables for English, loaded once at import time
# from the packaged data files (see module docstring for the file formats).
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
"""Counts of single letters in English."""
normalised_english_counts = szyfrow.support.norms.normalise(english_counts)
"""Normalised counts of single letters in English (the sum of all counts
adds to 1)."""

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
"""Counts of letter bigrams in English."""
normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)
"""Normalised counts of letter bigrams in English (the sum of all counts
adds to 1)."""

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
"""Counts of letter trigrams in English."""
normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)
"""Normalised counts of letter trigrams in English (the sum of all counts
adds to 1)."""
57
keywords = []
"""A sample list of keywords, to act as a dictionary for
dictionary-based cipher breaking attempts."""
# Populate the keyword list from the packaged word list, one word per
# line; rstrip removes the trailing newline (and any trailing whitespace).
with pkg_resources.open_text(language_model_files, 'words.txt') as f:
    keywords = [line.rstrip() for line in f]
63
64
def transpositions_of(keyword):
    """Find the transpositions given by a keyword. For instance, the keyword
    'clever' rearranges to 'celrv', so the first column (0) stays first, the
    second column (1) moves to third, the third column (2) moves to second,
    and so on.

    If passed a tuple, assume it's already a transposition and just return it.

    >>> transpositions_of('clever')
    (0, 2, 1, 4, 3)
    >>> transpositions_of('fred')
    (3, 2, 0, 1)
    >>> transpositions_of((3, 2, 0, 1))
    (3, 2, 0, 1)
    """
    # Already a transposition: hand it straight back.
    if isinstance(keyword, tuple):
        return keyword
    letters = deduplicate(keyword)
    # Position of each letter once the (deduplicated) key is sorted.
    return tuple(letters.index(letter) for letter in sorted(letters))
86
transpositions = collections.defaultdict(list)
"""A sample dict of transpositions, to act as a dictionary for
dictionary-based cipher breaking attempts. Each key is a transposition,
each value is a list of words that give that transposition."""
# Group the keyword list by the transposition each word generates, so
# breaking code can recover candidate keywords from a transposition.
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
93
94
def weighted_choice(d):
    """Generate a random item from a dictionary of item counts, picking
    each key with probability proportional to its count.

    Parameters:
    * `d`: a non-empty mapping of item -> count (or weight).

    Returns one key of `d`, chosen at random.
    """
    items, weights = list(zip(*d.items()))
    # random.choices returns a one-element list; unwrap it.
    return random.choices(items, weights)[0]
107
def random_english_letter():
    """Return one random letter, drawn according to English letter
    frequencies (from `normalised_english_counts`).
    """
    letter = weighted_choice(normalised_english_counts)
    return letter
112
113
def ngrams(text, n):
    """Return every n-gram (each length-n slice) of a text.

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
    'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
    'rown', 'ownf', 'wnfo', 'nfox']
    """
    grams = []
    for start in range(len(text) - n + 1):
        grams.append(text[start:start + n])
    return grams
125
126
class Pdist(dict):
    """A probability distribution estimated from counts in a datafile.
    Known keys map to their log (base 10) probabilities.

    Parameters:
    * `data`: an iterable of (key, count) pairs (may be a one-shot
      generator, such as the output of `datafile`).
    * `estimate_of_missing`: a function of (key, total_count) used to
      score keys absent from the data. The default returns 1/total_count
      — note this is not a log probability; it is kept for backward
      compatibility with existing callers.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        # `data` may only be iterable once, so tee it: one pass to find
        # the total count, a second to fill in the entries.
        counts_for_total, counts_for_entries = itertools.tee(data)
        self.total = sum(count for _, count in counts_for_total)
        for key, count in counts_for_entries:
            self[key] = log10(count / self.total)
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
    def __missing__(self, key):
        # Unknown keys are scored by the estimator instead of raising KeyError.
        return self.estimate_of_missing(key, self.total)
139
def log_probability_of_unknown_word(key, N):
    """Estimate the log probability of a word not seen in the corpus,
    penalising longer words more heavily.
    """
    length_penalty = 10 ** ((len(key) - 2) * 1.4)
    return -log10(N * length_penalty)
144
# Module-level probability distributions, built once at import time from
# the packaged count files.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of words. Unknown words have their probability estimated by
[log_probability_of_unknown_word](#szyfrow.support.language_models.log_probability_of_unknown_word)"""
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of single letters. Unknown words have their probability estimated as zero."""
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of letter bigrams. Unknown words have their probability estimated as zero."""
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of letter trigrams. Unknown words have their probability estimated as zero."""
158
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words: the sum of
    each word's log probability under `Pw`.
    """
    total = 0
    for word in words:
        total += Pw[word.lower()]
    return total
163
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters: the sum
    of each letter's log probability under `Pl`.
    """
    total = 0
    for letter in letters:
        total += Pl[letter.lower()]
    return total
168
def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a
    sequence of letters, summed under `P2l`.
    """
    total = 0
    for bigram in ngrams(letters, 2):
        total += P2l[bigram]
    return total
174
def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a
    sequence of letters, summed under `P3l`.
    """
    total = 0
    for trigram in ngrams(letters, 3):
        total += P3l[trigram]
    return total
180
181
def cosine_distance_score(text):
    """Find the dissimilarity of a text to English, using the cosine
    distance (1 - cosine similarity) between the text's letter frequency
    distribution and the English letter counts. Lower scores mean the
    text looks more like English.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.73771...
    """
    return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
        collections.Counter(sanitise(text)))
193
194
if __name__ == "__main__":
    # Run the embedded doctests when this module is executed as a script.
    import doctest
    doctest.testmod()