Caesar ciphers
[cipher-training.git] / language_models.py
"""Language-specific functions, including models of languages based on data of
their use.
"""
4
5 import string
6 import norms
7 import collections
8 import unicodedata
9
def letters(text):
    """Keep only the ASCII letters of a text, preserving their case and order.

    Digits, whitespace, punctuation and any non-ASCII characters are dropped.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    alphabet = string.ascii_letters
    return ''.join(ch for ch in text if ch in alphabet)
18
def unaccent(text):
    """Strip the accents from letters, returning an ASCII-only string.

    The text is first normalised to Unicode compatibility-decomposed form
    (NFKD), which separates an accented letter into its base letter followed
    by combining marks. Encoding to ASCII while ignoring errors then discards
    those marks (and any other character with no ASCII form), and the
    resulting bytes are decoded back into a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Anything that is not plain ASCII (e.g. combining accents) is dropped here.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
38
def sanitise(text):
    """Reduce a text to its letters only, in lowercase and without accents.

    Accents are stripped before filtering, so an accented letter is kept as
    its base letter instead of being discarded along with the other
    non-alphabetic characters.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    without_accents = unaccent(text)
    letters_only = letters(without_accents)
    return letters_only.lower()
52
53
if __name__ == "__main__":
    # When run as a script, execute the doctests embedded in the docstrings.
    import doctest
    doctest.testmod()