Transposition ciphers
[cipher-training.git] / language_models.py
1 """Language-specific functions, including models of languages based on data of
2 its use.
3 """
4
5 import string
6 import random
7 import norms
8 import collections
9 import unicodedata
10 import itertools
11 from math import log10
12
def letters(text):
    """Strip everything except ASCII letters from a text, preserving case.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    kept = filter(lambda ch: ch in string.ascii_letters, text)
    return ''.join(kept)
21
def unaccent(text):
    """Strip accents from the letters of a text.

    The text is normalised to Unicode compatibility-decomposed form (NFKD),
    which separates base letters from their combining accents.  Encoding to
    ASCII with errors ignored then drops every non-ASCII character (the
    accents included), and the remaining bytes are decoded back to a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
41
def sanitise(text):
    """Reduce a text to lowercase ASCII letters only.

    Accents are removed first, then every non-alphabetic character is
    dropped, and finally the result is lowercased.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    unaccented = unaccent(text)
    only_letters = letters(unaccented)
    return only_letters.lower()
55
56
def datafile(name, sep='\t'):
    """Read key, value pairs from a file.

    Each non-blank line of the file `name` is split on `sep`: the first
    field is the key and the second field is converted to an int.  Blank
    lines (for example a trailing empty line) are skipped instead of
    raising IndexError.

    Yields two-element lists ``[key, count]``, so the generator can be
    passed straight to ``dict()``.

    Raises ValueError if the second field is not an integer, and IndexError
    if a non-blank line does not contain `sep`.
    """
    with open(name, 'r') as f:
        for line in f:
            # A line holding only whitespace carries no key/count pair.
            if not line.strip():
                continue
            splits = line.split(sep)
            # int() tolerates the trailing newline left on the last field.
            yield [splits[0], int(splits[1])]
64
# Letter counts read from 'count_1l.txt' (tab-separated key/count pairs,
# the default format of datafile), held in a Counter.
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
# The same counts passed through norms.normalise.
# NOTE(review): presumably this scales the counts into relative
# frequencies -- confirm against the norms module.
normalised_english_counts = norms.normalise(english_counts)

# One entry per line of 'words.txt', with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]
70
71
def weighted_choice(d):
    """Generate a random item from a dictionary of item counts.

    `d` maps items to non-negative weights.  A target is drawn uniformly
    between 0 and the total weight, and the first item whose cumulative
    weight exceeds the target is returned, so each item is chosen with
    probability proportional to its weight.

    Returns None when `d` is empty or every weight is zero.
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    last_weighted = None
    for item, weight in d.items():
        cuml += weight
        if cuml > target:
            return item
        if weight > 0:
            last_weighted = item
    # random.uniform may return its upper endpoint, and sum() need not equal
    # the running float total, so the loop can finish without exceeding the
    # target.  Fall back to the last item that carries any weight rather
    # than returning None for a non-empty distribution.
    return last_weighted
82
def random_english_letter():
    """Pick a random letter, weighted by the normalised English letter counts.
    """
    letter = weighted_choice(normalised_english_counts)
    return letter
87
88
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.

    Values are stored and returned as log (base 10) probabilities.

    `data` is an iterable of (key, count) pairs; it may be a one-shot
    iterator such as a generator.  `estimate_of_missing`, if given, is a
    function ``(key, N) -> log probability`` used for keys that are not in
    the data, where N is the total of all counts.  When it is omitted,
    missing keys get ``log10(1/N)``, so that they are on the same log scale
    as the stored values.

    Looking up a missing key does not add it to the dictionary.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        # Materialise once: the data is walked twice (total, then entries)
        # and may be a generator that can only be consumed a single time.
        pairs = list(data)
        self.total = sum(count for _key, count in pairs)
        for key, count in pairs:
            self[key] = log10(count / self.total)
        if estimate_of_missing is None:
            estimate_of_missing = self._uniform_log_estimate
        self.estimate_of_missing = estimate_of_missing

    @staticmethod
    def _uniform_log_estimate(_key, N):
        """Default estimate for a missing key: the log of one count in N."""
        return log10(1. / N)

    def __missing__(self, key):
        return self.estimate_of_missing(key, self.total)
101
def log_probability_of_unknown_word(key, N):
    """Estimate the log (base 10) probability of an unknown word.

    The estimate is the log of ``1 / (N * 10**((len(key) - 2) * 1.4))``,
    where N is the total count of the distribution, so every extra letter
    in `key` lowers the log probability by 1.4.

    The value is computed entirely in log space: raising 10 to the
    length-dependent power directly overflows a float (OverflowError) once
    `key` is longer than about 220 characters.

    Raises ValueError if N is not positive.
    """
    return -(log10(N) + (len(key) - 2) * 1.4)
106
# Word distribution from 'count_1w.txt'; words missing from the data are
# scored by log_probability_of_unknown_word, which depends on word length.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
# Same word counts, but every missing word scores log10(1/N) regardless of
# its length.
Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N))
# Distributions built from 'count_1l.txt', 'count_2l.txt' and
# 'count_3l.txt'; keys missing from the data score 0.
# NOTE(review): presumably single-letter, two-letter and three-letter
# counts respectively -- confirm against the data files.
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
112
def Pwords(words):
    """Return the Naive Bayes log probability of a sequence of words.

    Each word is lowercased and looked up in Pw; the log probabilities are
    summed.
    """
    log_probs = [Pw[word.lower()] for word in words]
    return sum(log_probs)
117
def Pwords_wrong(words):
    """Return the Naive Bayes log probability of a sequence of words,
    using the Pw_wrong distribution.

    Each word is lowercased and looked up in Pw_wrong; the log probabilities
    are summed.
    """
    return sum(map(lambda word: Pw_wrong[word.lower()], words))
122
def Pletters(letters):
    """Return the Naive Bayes log probability of a sequence of letters.

    Each letter is lowercased and looked up in Pl; the log probabilities are
    summed.
    """
    # The parameter name hides the module-level letters() function inside
    # this body; it is kept because it is part of the signature.
    lowered = (letter.lower() for letter in letters)
    return sum(Pl[letter] for letter in lowered)
127
128
def cosine_similarity_score(text):
    """Score a text against English using its letter frequency distribution.

    The text is sanitised, its letters are counted, and the counts are
    passed together with english_counts to norms.cosine_similarity.
    NOTE(review): this was previously described as a dissimilarity (cosine
    distance) although the name says similarity -- confirm against norms.

    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
    0.26228882...
    """
    letter_counts = collections.Counter(sanitise(text))
    return norms.cosine_similarity(english_counts, letter_counts)
138
139
# When run as a script, execute the doctests embedded in this module's
# docstrings.
if __name__ == "__main__":
    import doctest
    doctest.testmod()