import string
import norms
import random
import collections
import unicodedata

english_counts = collections.defaultdict(int)
with open('count_1l.txt', 'r') as f:
    for line in f:
        (letter, count) = line.split("\t")
        english_counts[letter] = int(count)
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.defaultdict(int)
with open('count_2l.txt', 'r') as f:
    for line in f:
        (bigram, count) = line.split("\t")
        english_bigram_counts[bigram] = int(count)
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.defaultdict(int)
with open('count_3l.txt', 'r') as f:
    for line in f:
        (trigram, count) = line.split("\t")
        english_trigram_counts[trigram] = int(count)
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
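
# The normalised_* dictionaries are the raw counts passed through
# norms.normalise, so each letter / n-gram maps to a relative weight rather
# than an absolute count (e.g. normalised_english_counts['e'] presumably gives
# the proportion of 'e' among the letters tallied in count_1l.txt).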

with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]

def weighted_choice(d):
    """Pick a single random item from a dictionary of item counts,
    with probability proportional to each item's count
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    for (l, p) in d.items():
        cuml += p
        if cuml > target:
            return l
    return None
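
# Illustrative example: with counts {'a': 3, 'b': 1}, weighted_choice returns
# 'a' roughly three times as often as 'b'. It works with raw counts or
# normalised weights alike, since the target is drawn uniformly from
# [0, sum(d.values())).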

def random_english_letter():
    """Generate a random letter based on English letter counts
    """
    return weighted_choice(normalised_english_counts)


def letters(text):
    """Remove all non-alphabetic characters from a text
    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join([c for c in text if c in string.ascii_letters])

def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatibility
    form (NFKD), dropping the combining accents by encoding to ASCII, then
    decoding the bytes back to a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
    # return ''.join(sanitised)
    return letters(unaccent(text)).lower()
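
# Illustrative usage (values below are examples, not additional doctests):
#   sanitise('Attack at DAWN!')  ->  'attackatdawn'
#   unaccent('naïve café')       ->  'naive cafe'
# sanitise() is simply the unaccent -> letters -> str.lower pipeline above.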


if __name__ == "__main__":
    import doctest
    doctest.testmod()