Finished for a bit
[cipher-tools.git] / language_models.py
1 import string
2 import norms
3 import random
4 import collections
5 import unicodedata
6 import itertools
7 from math import log10
8
def letters(text):
    """Strip everything except ASCII letters from a text.

    Characters are kept in their original order and case; digits,
    whitespace, punctuation and non-ASCII characters are discarded.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    is_ascii_letter = lambda ch: ch in string.ascii_letters
    return ''.join(filter(is_ascii_letter, text))
17
def unaccent(text):
    """Strip accents from the letters in a text.

    The text is normalised to Unicode compatibility-decomposed form
    (NFKD), which splits an accented letter into its base letter plus
    combining marks. Encoding to ASCII with errors ignored then drops
    the combining marks, along with any other character that is still
    outside ASCII after decomposition. The remaining bytes are decoded
    back to a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8')
37
def sanitise(text):
    """Reduce a text to lowercase, unaccented ASCII letters only.

    Accents are removed first, so an accented letter survives as its
    base letter; then every non-letter is dropped and the remainder is
    lowercased.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    without_accents = unaccent(text)
    letters_only = letters(without_accents)
    return letters_only.lower()
51
52
def datafile(name, sep='\t'):
    """Read key, value pairs from a delimited text file.

    Each non-blank line is split on `sep`. The first field is the key
    and the second field is converted to an int; any further fields are
    ignored. Blank (whitespace-only) lines are skipped, so a trailing
    empty line at the end of the file does not abort the read.

    This is a generator: the file is opened when iteration starts and
    stays open until the generator is exhausted or closed.

    Yields:
        Two-element lists of the form [key, count].

    Raises:
        IndexError: a non-blank line does not contain `sep`.
        ValueError: the second field is not a valid integer.
    """
    with open(name, 'r') as f:
        for line in f:
            if not line.strip():
                # Nothing to parse on a blank line; skip it rather than
                # failing on the missing second field.
                continue
            fields = line.split(sep)
            yield [fields[0], int(fields[1])]
60
def _counts_from(filename):
    """Load a count file (as read by datafile) into a Counter."""
    return collections.Counter(dict(datafile(filename)))


# English letter, bigram and trigram counts, each paired with a version
# passed through norms.normalise (see the norms module for the scaling used).
english_counts = _counts_from('count_1l.txt')
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = _counts_from('count_2l.txt')
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = _counts_from('count_3l.txt')
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# Word list: one entry per line of words.txt, trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = list(map(str.rstrip, f))
72
73
def weighted_choice(d):
    """Pick a random key from a mapping of items to weights.

    A point is drawn uniformly between 0 and the sum of all weights;
    the keys are then walked in the mapping's iteration order,
    accumulating weights, and the first key whose running total exceeds
    the drawn point is returned. Keys with larger weights are therefore
    proportionally more likely to be chosen.

    Returns None if no running total ever exceeds the drawn point, which
    is always the case for an empty mapping.
    """
    threshold = random.uniform(0, sum(d.values()))
    running_total = 0.0
    for item in d:
        running_total += d[item]
        if running_total > threshold:
            return item
    return None
84
def random_english_letter():
    """Draw one random letter, weighted by the normalised English letter
    counts held in normalised_english_counts.
    """
    letter = weighted_choice(normalised_english_counts)
    return letter
89
90
class Pdist(dict):
    """A probability distribution estimated from (key, count) pairs.

    Values are stored and returned as log10 probabilities: each key maps
    to log10(count / total), where total is the sum of all counts and is
    kept in the `total` attribute.

    Looking up a key that is not present does not raise KeyError and
    does not store anything. Instead it returns
    estimate_of_missing(key, total), which must itself be a log10
    probability.

    Args:
        data: iterable of (key, count) pairs. It may be a one-shot
            iterator such as a generator. Defaults to no data.
        estimate_of_missing: callable taking (key, total) and returning
            the log10 probability to report for an unseen key. When
            omitted, unseen keys score -log10(total), i.e. the log of a
            1/total probability.

    Raises:
        ValueError: a count is zero (log10 of zero is undefined), or the
            default estimator is used while total is zero.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        super().__init__()
        # The pairs are needed twice (once for the total, once to fill
        # the table), so materialise them in case data is a generator.
        pairs = list(data)
        self.total = sum(count for _key, count in pairs)
        for key, count in pairs:
            self[key] = log10(count / self.total)
        # The fallback must be on the same log10 scale as the stored
        # values; returning the raw 1/N probability would make unseen
        # keys score higher than every seen key.
        self.estimate_of_missing = (
            estimate_of_missing or (lambda _key, N: -log10(N)))

    def __missing__(self, key):
        """Return the estimated log10 probability for an unseen key."""
        return self.estimate_of_missing(key, self.total)
103
def log_probability_of_unknown_word(key, N):
    """Estimate the log10 probability of an unknown word.

    The estimated probability is 1 / (N * 10**(1.4 * (len(key) - 2))),
    so each extra character lowers the log probability by 1.4. The value
    is computed directly in log space, as
    -(log10(N) + 1.4 * (len(key) - 2)), so that very long keys do not
    overflow the intermediate power of ten.

    Args:
        key: the unknown word; only its length is used.
        N: the total count of the distribution, which must be positive.

    Returns:
        The estimated log10 probability as a float.

    Raises:
        ValueError: N is zero or negative.
    """
    return -(log10(N) + (len(key) - 2) * 1.4)
108
# Word distribution: words absent from the count file are scored by
# log_probability_of_unknown_word.
Pw = Pdist(datafile('count_1w.txt'),
           estimate_of_missing=log_probability_of_unknown_word)
# Letter distribution: any key absent from the count file scores 0.
Pl = Pdist(datafile('count_1l.txt'),
           estimate_of_missing=lambda _k, _N: 0)
111
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.

    Each word is lowercased and looked up in Pw; the per-word log
    probabilities are added together.
    """
    total = 0
    for word in words:
        total += Pw[word.lower()]
    return total
116
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.

    Each letter is lowercased and looked up in Pl; the per-letter log
    probabilities are added together.
    """
    log_probabilities = [Pl[letter.lower()] for letter in letters]
    return sum(log_probabilities)
121
122
123
def cosine_distance_score(text):
    """Score how unlike English a text is, as the cosine distance
    between its letter frequencies and the English letter counts.

    The text is sanitised first, so only its unaccented, lowercased
    letters are counted.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.370847405...
    """
    letter_counts = collections.Counter(sanitise(text))
    return norms.cosine_distance(english_counts, letter_counts)
133
134
if __name__ == "__main__":
    # When run as a script, execute the doctests embedded in this
    # module's docstrings.
    import doctest
    doctest.testmod()