Done challenge 7, updated Amsco implementation to match
[cipher-tools.git] / language_models.py
1 import string
2 import norms
3 import random
4 import collections
5 import unicodedata
6 import itertools
7 from math import log10
8
# Translation table for characters that Unicode decomposition does not reduce
# to ASCII: the typographic apostrophe becomes a plain ASCII apostrophe.
unaccent_specials = str.maketrans({"’": "'"})
10
def letters(text):
    """Keep only the ASCII letters of a text, preserving case and order.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    alphabet = string.ascii_letters
    return ''.join(ch for ch in text if ch in alphabet)
19
def unaccent(text):
    """Strip accents from the letters of a text.

    The text is first passed through the `unaccent_specials` translation
    table, then converted to decomposed compatibility form (NFKD) so that
    accents become separate combining characters. Encoding to ASCII with
    errors ignored drops those combining characters (and anything else
    that is not ASCII); the remaining bytes are decoded back to a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    straightened = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', straightened)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
40
def sanitise(text):
    """Reduce a text to its unaccented letters, in lowercase.

    Accents are stripped first so that accented letters survive as their
    base letter instead of being discarded as non-alphabetic.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    plain = unaccent(text)
    alphabetic = letters(plain)
    return alphabetic.lower()
54
55
def datafile(name, sep='\t'):
    """Lazily read [key, count] pairs from a delimited text file.

    Each line is split on `sep`; the first field is the key and the second
    field is converted to an int. One two-element list is yielded per line.
    """
    with open(name, 'r') as source:
        for row in source:
            fields = row.split(sep)
            key, count = fields[0], int(fields[1])
            yield [key, count]
63
# Raw and normalised n-gram counts for English, loaded once at import time
# from the count files in the current working directory.
english_counts = collections.Counter(
    {key: count for key, count in datafile('count_1l.txt')})
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.Counter(
    {key: count for key, count in datafile('count_2l.txt')})
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.Counter(
    {key: count for key, count in datafile('count_3l.txt')})
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One candidate keyword per line, with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = list(map(str.rstrip, f))
75
76
def weighted_choice(d):
    """Pick a random key from a dictionary of item weights.

    Each key is chosen with probability proportional to its value.

    `random.uniform` may return its upper end-point because of
    floating-point rounding. In that case the running total never strictly
    exceeds the target, so the last positively weighted key is returned
    rather than falling through to None.

    Returns None only when there is nothing to choose from (an empty
    dictionary, or no positive weights).
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    fallback = None
    for item, weight in d.items():
        cuml += weight
        if cuml > target:
            return item
        if weight > 0:
            # Remember the most recent item that could legitimately win.
            fallback = item
    return fallback
87
def random_english_letter():
    """Draw one letter at random, weighted by normalised English letter
    counts.
    """
    letter = weighted_choice(normalised_english_counts)
    return letter
92
93
def ngrams(text, n):
    """Return every run of n consecutive items of a text, in order.

    A text shorter than n yields an empty list.

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    window_count = len(text) - n + 1
    return [text[start:start + n] for start in range(window_count)]
105
106
class Pdist(dict):
    """A probability distribution estimated from (key, count) pairs.

    Values are stored and returned as log10 probabilities.

    Parameters:
      data -- iterable of (key, count) pairs; it may be a one-shot
          generator, as it is materialised before use.
      estimate_of_missing -- optional callable (key, total) returning the
          log10 probability to report for a key that is not stored. When
          omitted, a missing key is treated as having been seen once in
          `total` observations, i.e. log10(1 / total).

    Attributes:
      total -- sum of all counts in `data`.
      estimate_of_missing -- the estimator used for missing keys.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        super().__init__()
        # Materialise so the pairs can be walked twice: once for the total,
        # once to store each key's log probability.
        pairs = list(data)
        self.total = sum(count for _, count in pairs)
        for key, count in pairs:
            self[key] = log10(count / self.total)
        if estimate_of_missing is None:
            estimate_of_missing = self._default_estimate
        self.estimate_of_missing = estimate_of_missing

    @staticmethod
    def _default_estimate(_key, total):
        """Log10 probability of a key seen once among `total` observations."""
        # Must be a log probability, on the same scale as the stored values.
        return log10(1. / total)

    def __missing__(self, key):
        # Missing keys are estimated on demand and never stored.
        return self.estimate_of_missing(key, self.total)
119
def log_probability_of_unknown_word(key, N):
    """Estimate the log10 probability of a word that has no recorded count.

    The estimate is -log10(N * 10**(1.4 * (len(key) - 2))): the longer the
    word, the less likely it is assumed to be.

    It is evaluated in log space, as -(log10(N) + 1.4 * (len(key) - 2)),
    so that very long keys do not overflow the intermediate power of ten.

    Parameters:
      key -- the unknown word.
      N -- total count of observations in the distribution.
    """
    return -(log10(N) + (len(key) - 2) * 1.4)
124
# Word model: words without a recorded count are scored by
# log_probability_of_unknown_word.
Pw = Pdist(datafile('count_1w.txt'),
           estimate_of_missing=log_probability_of_unknown_word)
# Letter and bigram models: keys without a recorded count score 0.
Pl = Pdist(datafile('count_1l.txt'), estimate_of_missing=lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), estimate_of_missing=lambda _k, _N: 0)
128
def Pwords(words):
    """Naive Bayes log probability of a sequence of words.

    Each word is lower-cased, looked up in the word model, and the
    resulting log probabilities are added together.
    """
    total = 0
    for word in words:
        total += Pw[word.lower()]
    return total
133
def Pletters(letters):
    """Naive Bayes log probability of a sequence of letters.

    Each letter is lower-cased, looked up in the letter model, and the
    resulting log probabilities are added together.
    """
    total = 0
    for letter in letters:
        total += Pl[letter.lower()]
    return total
138
def Pbigrams(letters):
    """Naive Bayes log probability of the bigrams in a sequence of letters.

    Every pair of adjacent letters is looked up in the bigram model and the
    resulting log probabilities are added together.
    """
    # NOTE(review): unlike Pletters, the input is not lower-cased before
    # lookup -- confirm whether callers always pass lowercase text.
    total = 0
    for bigram in ngrams(letters, 2):
        total += P2l[bigram]
    return total
144
145
def cosine_distance_score(text):
    """Score how unlike English a text is, as the cosine distance between
    the English letter counts and the letter counts of the sanitised text.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.370847405...
    """
    observed = collections.Counter(sanitise(text))
    return norms.cosine_distance(english_counts, observed)
155
156
# Run the module's doctests when it is executed as a script.
if __name__ == "__main__":
    from doctest import testmod
    testmod()