Unaccent letters before enciphering
[cipher-training.git] / language_models.py
1 import unicodedata
2
def unaccent(text):
    """Strip accents from the letters in a string.

    The text is first normalised to Unicode compatibility-decomposed form
    (NFKD), which splits each accented letter into its base letter followed
    by separate combining marks. The result is then encoded as ASCII with
    encoding errors ignored, so the combining marks (and any other character
    that still has no ASCII form after decomposition) are discarded.
    Finally the surviving bytes are decoded back into a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # Split accented letters into base letter + combining mark(s).
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently drops everything that cannot be expressed in ASCII.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    # The bytes are pure ASCII at this point, so decoding cannot fail.
    return ascii_bytes.decode('utf-8')
22
23
if __name__ == "__main__":
    # Executed as a script: run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()