Started 2015 challenges
[cipher-training.git] / language_models.py
1 """Language-specific functions, including models of languages based on data of
2 its use.
3 """
4
5 import string
6 import random
7 import norms
8 import collections
9 import unicodedata
10 import itertools
11 from math import log10
12
# Translation table applied by unaccent() before normalisation: it turns the
# typographic apostrophe into a plain ASCII one, which would otherwise be
# discarded when the text is encoded to ASCII.
unaccent_specials = str.maketrans({"’": "'"})
14
def letters(text):
    """Keep only the ASCII letters of a text, preserving their case and order.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join(filter(lambda ch: ch in string.ascii_letters, text))
23
def unaccent(text):
    """Return the text with accents stripped from its letters.

    The text is first passed through the unaccent_specials translation
    table, then normalised to the NFKD (compatibility decomposition) form.
    Encoding the result to ASCII with errors ignored discards every
    non-ASCII character, and the remaining bytes are decoded back to a
    string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    without_specials = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', without_specials)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
44
def sanitise(text):
    """Reduce a text to its lowercase ASCII letters.

    Accents are stripped with unaccent(), everything that is not an ASCII
    letter is dropped with letters(), and the result is lowercased.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    ascii_text = unaccent(text)
    return letters(ascii_text).lower()
58
59
def datafile(name, sep='\t'):
    """Read key, count pairs from a delimited text file.

    Each non-blank line is split on `sep`; the first field is the key and the
    second field is converted to an int. Whitespace-only lines (for example a
    trailing empty line at the end of the file) are skipped rather than
    aborting the read.

    Yields:
        Two-element lists of the form [key, count].

    Raises:
        IndexError: if a non-blank line does not contain `sep`.
        ValueError: if the second field is not an integer.
    """
    with open(name, 'r') as f:
        for line in f:
            if not line.strip():
                # Nothing to parse on this line; splits[1] would not exist.
                continue
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
67
# Raw letter, bigram and trigram counts, each read from its own count file.
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))

# The same tables after being passed through norms.normalise.
normalised_english_counts = norms.normalise(english_counts)
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One keyword per line of words.txt, with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = [entry.rstrip() for entry in f]

79
80
def weighted_choice(d):
    """Pick a random key from a dictionary of item weights (counts).

    A target is drawn uniformly between 0 and the sum of the weights, and the
    first item whose running total exceeds the target is returned, so each
    item is chosen with probability proportional to its weight.

    Returns:
        The chosen key, or None when `d` is empty or has no positive weight.
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    fallback = None
    for item, weight in d.items():
        cuml += weight
        if cuml > target:
            return item
        if weight > 0:
            fallback = item
    # random.uniform may return the upper bound itself, and the running total
    # can fall fractionally short of sum() through rounding. In either case
    # no running total exceeds the target, so hand back the last item that
    # carries any weight instead of returning None for a valid distribution.
    return fallback
91
def random_english_letter():
    """Draw one letter at random, weighted by normalised_english_counts."""
    letter = weighted_choice(normalised_english_counts)
    return letter
96
97
def ngrams(text, n):
    """List every run of n consecutive items of a text, in order.

    A text shorter than n yields an empty list.

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    window_count = len(text) - n + 1
    return [text[start:start + n] for start in range(window_count)]
109
110
class Pdist(dict):
    """A probability distribution estimated from (key, count) pairs.

    Values are stored and returned as base-10 log probabilities. Looking up
    a key that is not present does not raise and does not add the key; the
    result comes from `estimate_of_missing(key, total)` instead.

    Args:
        data: iterable of (key, count) pairs; one-shot iterables such as
            generators are accepted. Every count must be positive, because
            log10 of zero is undefined.
        estimate_of_missing: callable taking (key, N), where N is the sum of
            all counts, and returning the log probability to report for a
            key that is absent. Defaults to the log probability of a single
            occurrence, log10(1/N).

    Attributes:
        total: the sum of all counts in `data`.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        super().__init__()
        # Materialise once: the pairs are walked twice (total, then fill),
        # which a generator could not survive.
        pairs = list(data)
        self.total = sum(count for _, count in pairs)
        for key, count in pairs:
            self[key] = log10(count / self.total)
        # The fallback must be on the same log scale as the stored values,
        # so it is log10(1/N), written as -log10(N), not the linear 1/N.
        self.estimate_of_missing = (estimate_of_missing
                                    or (lambda _key, N: -log10(N)))

    def __missing__(self, key):
        return self.estimate_of_missing(key, self.total)
123
def log_probability_of_unknown_word(key, N):
    """Estimate the base-10 log probability of a word that was never seen.

    The estimate starts from 1/N and is divided by a further factor of
    10**1.4 for every character the word has beyond two, so longer unknown
    words are treated as less likely.
    """
    length_penalty = 10 ** ((len(key) - 2) * 1.4)
    return -log10(N * length_penalty)
128
# Word model: unseen words are scored by log_probability_of_unknown_word.
Pw = Pdist(datafile('count_1w.txt'),
           estimate_of_missing=log_probability_of_unknown_word)
# Word model in which every unseen word scores log10(1/N), whatever its
# length.
Pw_wrong = Pdist(datafile('count_1w.txt'),
                 estimate_of_missing=lambda _k, N: log10(1/N))
# Letter, bigram and trigram models: an unseen n-gram contributes 0.
Pl = Pdist(datafile('count_1l.txt'), estimate_of_missing=lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), estimate_of_missing=lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), estimate_of_missing=lambda _k, _N: 0)
134
def Pwords(words):
    """Naive Bayes log probability of a sequence of words.

    Each word is lowercased, looked up in Pw, and the log probabilities are
    summed.
    """
    log_probs = [Pw[word.lower()] for word in words]
    return sum(log_probs)
139
def Pwords_wrong(words):
    """Naive Bayes log probability of a sequence of words, using Pw_wrong.

    Each word is lowercased, looked up in Pw_wrong, and the log
    probabilities are summed.
    """
    log_probs = [Pw_wrong[word.lower()] for word in words]
    return sum(log_probs)
144
def Pletters(letters):
    """Naive Bayes log probability of a sequence of letters.

    Each letter is lowercased, looked up in Pl, and the log probabilities
    are summed.
    """
    log_probs = [Pl[letter.lower()] for letter in letters]
    return sum(log_probs)
149
def Pbigrams(letters):
    """Naive Bayes log probability of the bigrams in a sequence of letters.

    The bigrams come from ngrams(letters, 2) and are looked up in P2l
    exactly as given (no case folding is applied here).
    """
    log_probs = [P2l[bigram] for bigram in ngrams(letters, 2)]
    return sum(log_probs)
155
def Ptrigrams(letters):
    """Naive Bayes log probability of the trigrams in a sequence of letters.

    The trigrams come from ngrams(letters, 3) and are looked up in P3l
    exactly as given (no case folding is applied here).
    """
    log_probs = [P3l[trigram] for trigram in ngrams(letters, 3)]
    return sum(log_probs)
161
162
def cosine_similarity_score(text):
    """Score how the letter frequencies of a text compare with English.

    The text is sanitised, its letters are counted, and the counts are
    compared with english_counts by norms.cosine_similarity, whose result is
    returned unchanged.
    NOTE(review): this was previously described as a dissimilarity (cosine
    distance) although the name says similarity -- confirm which one
    norms.cosine_similarity actually computes.

    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
    0.26228882...
    """
    letter_counts = collections.Counter(sanitise(text))
    return norms.cosine_similarity(english_counts, letter_counts)
172
173
if __name__ == '__main__':
    # Executed as a script: run the doctest examples in this module.
    import doctest
    doctest.testmod()