Started 2015 challenges
[cipher-tools.git] / language_models.py
index 929746888d036fb54de3f1fbf228e296e0bcd027..19f886fcefcb4384184e0bbad108e6925f029bbf 100644 (file)
@@ -6,6 +6,8 @@ import unicodedata
 import itertools
 from math import log10
 
+unaccent_specials = ''.maketrans({"’": "'"})  # translation table: curly apostrophe -> ASCII "'"; applied by unaccent() before NFKD normalisation
+
 def letters(text):
     """Remove all non-alphabetic characters from a text
     >>> letters('The Quick')
@@ -31,7 +33,8 @@ def unaccent(text):
     >>> unaccent('HÉLLÖ')
     'HELLO'
     """
-    return unicodedata.normalize('NFKD', text).\
+    translated_text = text.translate(unaccent_specials)
+    return unicodedata.normalize('NFKD', translated_text).\
         encode('ascii', 'ignore').\
         decode('utf-8')