Merge branch 'master' of git.njae.me.uk:national-cipher-challenge
[cipher-tools.git] / language_models.py
1 import string
2 import norms
3 import random
4 import collections
5 import unicodedata
6 import itertools
7 from math import log10
8 import os
9
# Translation table mapping typographic (curly) quotes to their plain ASCII
# equivalents.  unaccent() applies it before stripping non-ASCII characters,
# so these quotes are converted rather than discarded.
unaccent_specials = str.maketrans({"’": "'", '“': '"', '”': '"'})
11
def letters(text):
    """Remove all non-alphabetic characters from a text.

    Only the ASCII letters (``string.ascii_letters``) are kept; digits,
    whitespace, punctuation and accented letters are all dropped.  Case is
    preserved.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join(c for c in text if c in string.ascii_letters)
20
def unaccent(text):
    """Remove all accents from letters.

    It does this by first mapping typographic quotes to plain ASCII quotes
    (via ``unaccent_specials``), then converting the unicode string to
    decomposed compatibility form (NFKD), dropping the combining accents,
    and decoding the remaining ASCII bytes back into a string.

    Note that any other character with no ASCII representation is dropped
    as well, not just combining accents.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    translated_text = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', translated_text)
    # errors='ignore' discards every code point outside ASCII, which is what
    # removes the combining accents separated out by the NFKD decomposition.
    # The surviving bytes are pure ASCII, so decode them as such.
    return decomposed.encode('ascii', 'ignore').decode('ascii')
41
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase.

    Accented letters are first reduced to their unaccented form (see
    ``unaccent``), so they survive as plain letters instead of being
    discarded by ``letters``.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    return letters(unaccent(text)).lower()
55
56
def datafile(name, sep='\t'):
    """Read key, value pairs from a file.

    ``name`` is resolved relative to the directory containing this module
    (an absolute ``name`` is used as given).  Each non-blank line is split
    on ``sep``; the first field is the key and the second is parsed as an
    integer count.  Any further fields are ignored.

    Yields two-element lists ``[key, count]``, lazily, in file order.

    Raises ``IndexError`` if a non-blank line does not contain ``sep`` and
    ``ValueError`` if the second field is not an integer.
    """
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), name)
    with open(path, 'r') as f:
        for line in f:
            # Blank lines (typically a trailing one at the end of the file)
            # carry no record; skip them rather than fail on splits[1].
            if not line.strip():
                continue
            splits = line.split(sep)
            # int() tolerates the trailing newline left on the last field.
            yield [splits[0], int(splits[1])]
64
# Letter, bigram and trigram counts are loaded once at import time from the
# data files that sit alongside this module.  Each raw Counter is paired with
# the result of passing it through norms.normalise().
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One keyword per line of words.txt, with trailing whitespace (including the
# newline) removed.
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'words.txt'), 'r') as f:
    keywords = [line.rstrip() for line in f]
76
77
def weighted_choice(d):
    """Generate a random item from a dictionary of item counts.

    ``d`` maps items to non-negative weights; an item is returned with
    probability proportional to its weight.  Returns ``None`` only when
    there is nothing to choose from (``d`` is empty or every weight is
    zero).
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    fallback = None
    for (l, p) in d.items():
        cuml += p
        if cuml > target:
            return l
        if p > 0:
            fallback = l
    # random.uniform() may return its upper endpoint, and the running float
    # sum can land fractionally below the target, so the loop can finish
    # without a hit.  The last positively weighted item is the right answer
    # in that case; previously None leaked out to the caller.
    return fallback
88
def random_english_letter():
    """Generate a random letter based on English letter counts.

    The letter is drawn by ``weighted_choice`` using
    ``normalised_english_counts`` as the weights.
    """
    return weighted_choice(normalised_english_counts)
93
94
def ngrams(text, n):
    """Returns all n-grams of a text.

    The n-grams are the overlapping length-``n`` slices of ``text``, in
    order of their starting position.  If ``text`` is shorter than ``n``
    the result is an empty list.

    Raises ``ValueError`` if ``n`` is less than 1, since slicing would
    otherwise silently produce empty or meaningless fragments.

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {!r}'.format(n))
    return [text[i:i+n] for i in range(len(text)-n+1)]
106
107
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.

    Values are stored and returned as log probabilities (base 10).

    ``data`` is an iterable of ``(key, count)`` pairs; it may be a one-shot
    iterator such as a generator.  ``estimate_of_missing`` is an optional
    callable ``f(key, total)`` returning the log probability to report for
    a key that was never seen; when omitted, an unseen key is treated as
    having probability ``1 / total``.

    Attributes:
        total: the sum of all counts in ``data``.
        estimate_of_missing: the callable used for unseen keys.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        super().__init__()
        # Two passes are needed (total first, then the per-key ratios), so
        # materialise the input once in case it is a one-shot iterator.
        pairs = list(data)
        self.total = sum(count for _key, count in pairs)
        for key, count in pairs:
            self[key] = log10(count / self.total)
        # The default returns log10(1/N) so that, like every stored value,
        # the estimate for an unseen key is a log probability.
        self.estimate_of_missing = estimate_of_missing or (
            lambda k, N: log10(1. / N))

    def __missing__(self, key):
        """Return the estimated log probability of an unseen key.

        The estimate is computed on demand and is not stored in the dict.
        """
        return self.estimate_of_missing(key, self.total)
120
def log_probability_of_unknown_word(key, N):
    """Estimate the log probability of an unknown word.

    ``key`` is the unknown word and ``N`` the total count of the
    distribution.  The estimate is ``-log10(N * 10**(1.4 * (len(key) - 2)))``,
    so each additional character makes the word 1.4 orders of magnitude
    less likely.
    """
    # Evaluated in log space: raising 10 to the length-dependent power
    # directly overflows a float once the key is a couple of hundred
    # characters long.
    return -(log10(N) + (len(key) - 2) * 1.4)
125
# Log-probability distributions over words, single letters and letter
# bigrams, built from the count files at import time.  Unknown words are
# penalised by length.
# NOTE(review): unseen letters and bigrams get an estimate of 0, i.e. they
# add nothing to a sum of log probabilities (equivalent to probability 1) --
# confirm this is intended rather than a penalty.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
129
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.

    Each word is lowercased and looked up in ``Pw``; the individual log
    probabilities are summed.  Words absent from ``Pw`` contribute its
    missing-word estimate.
    """
    return sum(Pw[w.lower()] for w in words)
134
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.

    Each letter is lowercased and looked up in ``Pl``; the individual log
    probabilities are summed.
    """
    return sum(Pl[l.lower()] for l in letters)
139
def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a sequence
    of letters.

    Every overlapping pair of adjacent letters is lowercased and looked up
    in ``P2l``; the individual log probabilities are summed.
    """
    # Lowercase each bigram, as Pwords and Pletters do for their keys, so
    # that upper-case input is not scored as a run of unseen bigrams.
    return sum(P2l[p.lower()] for p in ngrams(letters, 2))
145
146
def cosine_distance_score(text):
    """Finds the dissimilarity of a text to English, using the cosine distance
    of the frequency distribution.

    The text is sanitised first, so only its letters (lowercased and
    unaccented) are counted.  Those counts are compared against the raw
    ``english_counts`` using ``norms.cosine_distance``.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.370847405...
    """
    return norms.cosine_distance(english_counts,
        collections.Counter(sanitise(text)))
156
157
# Running the module as a script executes the doctests embedded in the
# docstrings above.
if __name__ == "__main__":
    import doctest
    doctest.testmod()