4798d731610929e3fc8c31324e83db1a20eafd61
[cipher-training.git] / language_models.py
1 """Language-specific functions, including models of languages based on data of
2 its use.
3 """
4
5 import string
6 import random
7 import norms
8 import collections
9 import unicodedata
10 import itertools
11 from math import log10
12
def letters(text):
    """Return only the alphabetic (ASCII a-z, A-Z) characters of a text,
    preserving their order and case.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    wanted = string.ascii_letters
    return ''.join(c for c in text if c in wanted)
21
def unaccent(text):
    """Strip accents from the letters of a text.

    The string is first decomposed (NFKD compatibility form) so each
    accented letter becomes a base letter plus combining marks; encoding
    to ASCII with errors ignored then drops the combining marks.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
41
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to
    lowercase, stripping accents on the way (via unaccent / letters).

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    return letters(unaccent(text)).lower()
55
56
def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Yields one [key, count] list per line of the file `name`, where
    fields are separated by `sep` and the second field is parsed as
    an int.
    """
    with open(name, 'r') as source:
        for row in source:
            fields = row.split(sep)
            yield [fields[0], int(fields[1])]
64
# Raw English letter-frequency counts, loaded from the tab-separated data
# file 'count_1l.txt' (one letter<TAB>count pair per line).
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
# The same counts normalised into a probability distribution by the
# project's norms module.
normalised_english_counts = norms.normalise(english_counts)

# The log 10 probabilities of letters
Pl = {l: log10(n) for l, n in normalised_english_counts.items()}

# Candidate keywords, one per line of 'words.txt', trailing whitespace
# (including the newline) stripped.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]
73
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.

    Sums the per-letter log10 probabilities from the module-level Pl
    table; each character is lowercased before lookup.
    (Note: the parameter name shadows the module-level letters()
    function; it is kept for backward compatibility with keyword calls.)
    """
    total = 0
    for character in letters:
        total += Pl[character.lower()]
    return total
78
79
def cosine_similarity_score(text):
    """Finds the similarity of a text to English, as the cosine similarity
    between the English letter-frequency distribution and the letter
    frequencies of the (sanitised) text. Higher scores mean the text's
    letter distribution is closer to English.

    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
    0.26228882...
    """
    return norms.cosine_similarity(english_counts,
        collections.Counter(sanitise(text)))
89
90
if __name__ == "__main__":
    # Run all the doctests embedded in this module's docstrings.
    import doctest
    doctest.testmod()