szyfrow/support/language_models.py
import random
import collections
import itertools
from math import log10
import importlib.resources as pkg_resources

import szyfrow.support.norms
from szyfrow.support.utilities import sanitise, deduplicate
from szyfrow import language_model_files


def datafile(name, sep='\t'):
    """Read key,value pairs from file.
    """
    with pkg_resources.open_text(language_model_files, name) as f:
        for line in f:
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]

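# A hedged note on the data format: each count file is assumed to hold one
# tab-separated "item<TAB>count" record per line, matching the sep='\t' and
# int(splits[1]) handling above. The files loaded below are the packaged
# letter, bigram, trigram and word counts.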
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = szyfrow.support.norms.normalise(english_counts)

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)

with pkg_resources.open_text(language_model_files, 'words.txt') as f:
    keywords = [line.rstrip() for line in f]


def transpositions_of(keyword):
    """Finds the transpositions given by a keyword. For instance, the keyword
    'clever' deduplicates to 'clevr', whose letters sort as 'celrv', so the
    first column (0) stays first, the second column (1) moves to third, the
    third column (2) moves to second, and so on.

    If passed a tuple, assume it's already a transposition and just return it.

    >>> transpositions_of('clever')
    (0, 2, 1, 4, 3)
    >>> transpositions_of('fred')
    (3, 2, 0, 1)
    >>> transpositions_of((3, 2, 0, 1))
    (3, 2, 0, 1)
    """
    if isinstance(keyword, tuple):
        return keyword
    else:
        key = deduplicate(keyword)
        transpositions = tuple(key.index(l) for l in sorted(key))
        return transpositions

transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)].append(word)
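# transpositions now maps each transposition tuple back to every keyword in
# words.txt that yields it; if 'fred' appears in words.txt, for example, it
# would be collected under the key (3, 2, 0, 1).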


def weighted_choice(d):
    """Generate a random item from a dictionary of item counts, choosing
    each item with probability proportional to its count.
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    for (l, p) in d.items():
        cuml += p
        if cuml > target:
            return l
    return None
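# A hedged usage sketch: weighted_choice({'a': 3, 'b': 1}) returns 'a' about
# three times as often as 'b'. With normalised counts the values sum to 1 and
# act directly as probabilities.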

def random_english_letter():
    """Generate a random letter based on English letter counts
    """
    return weighted_choice(normalised_english_counts)


def ngrams(text, n):
    """Returns all n-grams of a text

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    return [text[i:i+n] for i in range(len(text)-n+1)]


class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log probabilities.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        data1, data2 = itertools.tee(data)
        self.total = sum(d[1] for d in data1)
        for key, count in data2:
            self[key] = log10(count / self.total)
        # NB: the default fallback returns a plain probability, not a log
        # probability; every instance below supplies a log-space estimator.
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)

    def __missing__(self, key):
        return self.estimate_of_missing(key, self.total)
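# A hedged usage sketch: Pdist behaves as a dict of log probabilities. Given
# the hypothetical data [('the', 80), ('of', 20)], a Pdist would store
# log10(80 / 100) for 'the' and defer to estimate_of_missing for unseen keys.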

def log_probability_of_unknown_word(key, N):
    """Estimate the log probability of an unknown word: the longer the word,
    the less likely it is, with each letter beyond the second reducing the
    estimate by a factor of 10**1.4 (roughly 25).
    """
    return -log10(N * 10**((len(key) - 2) * 1.4))

# Log-probability models for words, single letters, bigrams and trigrams.
# The letter models fall back to a log probability of 0 (i.e. probability 1)
# for unseen n-grams, so unknown n-grams leave a score unchanged.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)

def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.
    """
    return sum(Pw[w.lower()] for w in words)

def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.
    """
    return sum(Pl[l.lower()] for l in letters)

def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a sequence
    of letters.
    """
    return sum(P2l[p] for p in ngrams(letters, 2))

def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a sequence
    of letters.
    """
    return sum(P3l[p] for p in ngrams(letters, 3))
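# A hedged usage sketch: these scorers rank candidate plaintexts; among
# candidates, the one whose letters, bigrams or trigrams are most English-like
# gets the highest (least negative) total. Exact values depend on the
# packaged count files.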


def cosine_distance_score(text):
    """Finds the dissimilarity of a text to English, using the cosine distance
    of the frequency distribution.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.73771...
    """
    return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
        collections.Counter(sanitise(text)))


if __name__ == "__main__":
    import doctest
    doctest.testmod()