# Caesar ciphers
# [cipher-training.git] / language_models.py
"""Language-specific functions, including models of languages based on data of
their use.
"""
4
import string
import unicodedata
7
def letters(text):
    """Remove all non-alphabetic characters from a text.

    Only the ASCII letters (``string.ascii_letters``) are kept; their case
    and order are preserved.  Digits, whitespace, punctuation and any
    non-ASCII character are dropped.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    alphabet = string.ascii_letters
    return ''.join(char for char in text if char in alphabet)
16
def unaccent(text):
    """Remove all accents from letters.

    The text is converted to Unicode decomposed compatibility form (NFKD),
    which separates an accented letter into its base letter followed by
    combining marks.  Encoding the result as ASCII with errors ignored then
    discards the combining marks -- and any other character that still has
    no ASCII representation -- before the bytes are decoded back to a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    # The bytes are pure ASCII by construction, so decode with the same codec.
    return ascii_bytes.decode('ascii')
36
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase.

    Accents are stripped before filtering, so an accented letter contributes
    its unaccented base letter to the result.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Unaccent first: letters() keeps only ASCII letters, so an accented
    # character would otherwise be discarded instead of mapped to its base.
    unaccented = unaccent(text)
    return letters(unaccented).lower()
50
51
52
if __name__ == "__main__":
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest
    doctest.testmod()