Merge branch 'development' of git.njae.me.uk:cipher-training into development
[cipher-training.git] / language_models.py
1 """Language-specific functions, including models of languages based on data of
2 its use.
3 """
4
5 import string
6 import random
7 import norms
8 import collections
9 import unicodedata
10 import itertools
11 from math import log10
12 import os
13
# Translation table turning typographic ("curly") quotation marks into their
# plain ASCII counterparts. unaccent() applies it before discarding non-ASCII
# characters, so these quotes are converted rather than lost.
unaccent_specials = str.maketrans({
    '’': "'",
    '“': '"',
    '”': '"',
})
15
def letters(text):
    """Return `text` with everything except ASCII letters stripped out.

    Case is preserved; digits, whitespace, punctuation and non-ASCII
    characters are discarded.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    kept = filter(lambda ch: ch in string.ascii_letters, text)
    return ''.join(kept)
24
def unaccent(text):
    """Return `text` with accents removed from its letters.

    Curly quotes are first replaced by their ASCII equivalents (see
    `unaccent_specials`). The text is then put into Unicode compatibility
    decomposed form (NFKD), which separates base letters from their
    combining accents, and encoded to ASCII while ignoring errors. Note that
    this discards every remaining non-ASCII character, not only accents.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    straight_quoted = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', straight_quoted)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8')
45
def sanitise(text):
    """Reduce `text` to lowercase ASCII letters only.

    Accents are stripped first (so accented letters survive as their base
    letter), then every non-letter is dropped, then the result is lowercased.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    without_accents = unaccent(text)
    only_letters = letters(without_accents)
    return only_letters.lower()
59
60
def datafile(name, sep='\t'):
    """Lazily read (key, count) records from a data file.

    `name` is resolved relative to the directory containing this module.
    Each non-blank line is split on `sep`; the first field is the key and
    the second is converted to an int.

    Blank (whitespace-only) lines, such as a trailing empty line at the end
    of the file, are skipped instead of raising IndexError.

    Yields:
        Two-element lists `[key, count]`, in file order.

    Raises:
        IndexError: if a non-blank line has no `sep`-separated second field.
        ValueError: if the second field is not an integer.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(module_dir, name), 'r') as f:
        for line in f:
            if not line.strip():
                # Nothing to parse on an empty line.
                continue
            fields = line.split(sep)
            # int() tolerates the trailing newline left on the last field.
            yield [fields[0], int(fields[1])]
68
def _load_counts(filename):
    """Read a count file (via datafile) into a Counter keyed by n-gram."""
    return collections.Counter(dict(datafile(filename)))


# Raw and normalised frequency tables for single letters, bigrams and
# trigrams, loaded once at import time from the data files next to this
# module.
english_counts = _load_counts('count_1l.txt')
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = _load_counts('count_2l.txt')
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = _load_counts('count_3l.txt')
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
77
# Word list loaded at import time: one entry per line of words.txt (found in
# this module's directory), with trailing whitespace removed.
_words_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'words.txt')
with open(_words_path, 'r') as f:
    keywords = list(map(str.rstrip, f))
80
81
def weighted_choice(d):
    """Pick a random key of `d`, with each key weighted by its value.

    A threshold is drawn uniformly between 0 and the sum of all weights;
    the keys are then walked in iteration order and the first one whose
    running total of weights exceeds the threshold is returned.

    Returns None if no running total ever exceeds the threshold (for
    instance when `d` is empty).
    """
    threshold = random.uniform(0, sum(d.values()))
    running_total = 0.0
    for item, weight in d.items():
        running_total += weight
        if running_total <= threshold:
            continue
        return item
    return None
92
def random_english_letter():
    """Return a random letter drawn according to English letter frequencies.

    The weights come from `normalised_english_counts`.
    """
    letter = weighted_choice(normalised_english_counts)
    return letter
97
98
def ngrams(text, n):
    """Return every contiguous length-`n` slice of `text`, in order.

    The windows overlap: each one starts one position after the previous.
    If `text` is shorter than `n` the result is an empty list.

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    # Slide the window by tracking where it ends rather than where it starts.
    window_ends = range(n, len(text) + 1)
    return [text[end - n:end] for end in window_ends]
110
111
class Pdist(dict):
    """A probability distribution estimated from (key, count) pairs.

    Values are stored and returned as log10 probabilities: each key maps to
    log10(count / total), where `total` is the sum of all counts.

    Looking up an unseen key with `dist[key]` does not raise KeyError;
    instead it returns `estimate_of_missing(key, total)`, which must also be
    a log10 probability. The estimate is not stored, so the key stays absent
    from the dictionary (`in` and `.get()` are unaffected).
    """

    def __init__(self, data=(), estimate_of_missing=None):
        """Build the distribution.

        Args:
            data: iterable of (key, count) pairs; it may be a one-shot
                iterator such as a generator.
            estimate_of_missing: optional callable `(key, total)` returning
                the log10 probability to use for a key that was not in
                `data`. Defaults to log10(1 / total).
        """
        super().__init__()
        # Materialise once: the pairs are needed both for the total and for
        # the per-key probabilities, and `data` may not be re-iterable.
        pairs = list(data)
        self.total = sum(count for _key, count in pairs)
        for key, count in pairs:
            self[key] = log10(count / self.total)
        self.estimate_of_missing = estimate_of_missing or self._uniform_estimate

    @staticmethod
    def _uniform_estimate(_key, total):
        """Default estimate for unseen keys: log10 of one occurrence in `total`.

        This is a log probability, matching the stored values, so an unseen
        key never scores higher than a key that was actually observed.
        """
        return log10(1 / total)

    def __missing__(self, key):
        """Return the estimated log probability for a key not in the data."""
        return self.estimate_of_missing(key, self.total)
124
def log_probability_of_unknown_word(key, N):
    """Estimate the log10 probability of a word absent from the word counts.

    The estimated probability is 1 / (N * 10**(1.4 * (len(key) - 2))), so
    every additional letter makes the word 10**1.4 times less likely.

    Args:
        key: the unknown word.
        N: the total count the probability is taken relative to.
    """
    letters_beyond_two = len(key) - 2
    length_penalty = 10 ** (letters_beyond_two * 1.4)
    return -log10(N * length_penalty)
129
def _one_in_total_log_probability(_key, total):
    """Missing-word estimate that ignores the word: log10(1/total)."""
    return log10(1/total)


def _zero_log_probability(_key, _total):
    """Missing n-gram estimate: a constant log probability of 0.

    NOTE(review): a log probability of 0 corresponds to probability 1, so
    n-grams absent from the tables are not penalised -- confirm intended.
    """
    return 0


# Word-level distributions. Pw penalises unknown words by their length;
# Pw_wrong gives every unknown word the same score.
Pw = Pdist(datafile('count_1w.txt'),
           estimate_of_missing=log_probability_of_unknown_word)
Pw_wrong = Pdist(datafile('count_1w.txt'),
                 estimate_of_missing=_one_in_total_log_probability)

# Letter, bigram and trigram distributions.
Pl = Pdist(datafile('count_1l.txt'), estimate_of_missing=_zero_log_probability)
P2l = Pdist(datafile('count_2l.txt'), estimate_of_missing=_zero_log_probability)
P3l = Pdist(datafile('count_3l.txt'), estimate_of_missing=_zero_log_probability)
135
def Pwords(words):
    """Return the Naive Bayes log probability of a sequence of words.

    Each word is lowercased and looked up in `Pw`; the per-word log
    probabilities are added together.
    """
    log_probability = 0
    for word in words:
        log_probability += Pw[word.lower()]
    return log_probability
140
def Pwords_wrong(words):
    """Return the Naive Bayes log probability of a sequence of words,
    scored with `Pw_wrong`.

    Identical to Pwords except for the distribution used, which gives every
    unknown word the same estimate regardless of its length.
    """
    log_probability = 0
    for word in words:
        log_probability += Pw_wrong[word.lower()]
    return log_probability
145
def Pletters(letters):
    """Return the Naive Bayes log probability of a sequence of letters.

    Each letter is lowercased and looked up in `Pl`; the per-letter log
    probabilities are added together.
    """
    log_probability = 0
    for letter in letters:
        log_probability += Pl[letter.lower()]
    return log_probability
150
def Pbigrams(letters):
    """Return the Naive Bayes log probability of the bigrams formed from a
    sequence of letters.

    The overlapping bigrams of `letters` are looked up in `P2l` exactly as
    given (no case folding is applied here) and their log probabilities are
    added together.
    """
    log_probability = 0
    for bigram in ngrams(letters, 2):
        log_probability += P2l[bigram]
    return log_probability
156
def Ptrigrams(letters):
    """Return the Naive Bayes log probability of the trigrams formed from a
    sequence of letters.

    The overlapping trigrams of `letters` are looked up in `P3l` exactly as
    given (no case folding is applied here) and their log probabilities are
    added together.
    """
    log_probability = 0
    for trigram in ngrams(letters, 3):
        log_probability += P3l[trigram]
    return log_probability
162
163
def cosine_similarity_score(text):
    """Score a text against English letter frequencies.

    The text is sanitised, its letters are counted, and the counts are
    compared with `english_counts` using `norms.cosine_similarity`.

    NOTE(review): the earlier docstring described this as a dissimilarity
    (cosine distance) while the names say similarity; which direction means
    "more English-like" depends on norms.cosine_similarity -- confirm there.

    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
    0.26228882...
    """
    letter_counts = collections.Counter(sanitise(text))
    return norms.cosine_similarity(english_counts, letter_counts)
173
174
if __name__ == "__main__":
    # Running this module as a script executes the doctests embedded in its
    # docstrings.
    from doctest import testmod
    testmod()