From: Neil Smith <neil.git@njae.me.uk>
Date: Mon, 10 Mar 2014 20:33:05 +0000 (-0400)
Subject: Unaccent letters before enciphering
X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=69c9038aaf5cc2f0a758435713f18f3b51bbbe4a;p=cipher-training.git

Unaccent letters before enciphering
---

diff --git a/cipher.py b/cipher.py
index e3c183d..4a12703 100644
--- a/cipher.py
+++ b/cipher.py
@@ -1,6 +1,7 @@
 import string
+from language_models import *
 
-def caesar_encipher_letter(letter, shift):
+def caesar_encipher_letter(accented_letter, shift):
     """Encipher a letter, given a shift amount
 
     >>> caesar_encipher_letter('a', 1)
@@ -19,7 +20,12 @@ def caesar_encipher_letter(letter, shift):
     'y'
     >>> caesar_encipher_letter('a', -1)
     'z'
+    >>> caesar_encipher_letter('A', 1)
+    'B'
+    >>> caesar_encipher_letter('é', 1)
+    'f'
     """
+    letter = unaccent(accented_letter)
     if letter in string.ascii_letters:
         if letter in string.ascii_uppercase:
             alphabet_start = ord('A')
@@ -51,6 +57,8 @@ def caesar_encipher(message, shift):
     'cdezab'
     >>> caesar_encipher('ab cx yz', 2)
     'cd ez ab'
+    >>> caesar_encipher('Héllo World!', 2)
+    'Jgnnq Yqtnf!'
     """
     enciphered = [caesar_encipher_letter(l, shift) for l in message]
     return ''.join(enciphered)
@@ -64,6 +72,8 @@ def caesar_decipher(message, shift):
     'abc'
     >>> caesar_decipher('cd ez ab', 2)
     'ab cx yz'
+    >>> caesar_decipher('Jgnnq Yqtnf!', 2)
+    'Hello World!'
     """
     return caesar_encipher(message, -shift)
 
diff --git a/language_models.py b/language_models.py
new file mode 100644
index 0000000..1786ebe
--- /dev/null
+++ b/language_models.py
@@ -0,0 +1,26 @@
+import unicodedata
+
+def unaccent(text):
+    """Remove all accents from letters.
+    It does this by converting the unicode string to decomposed compatibility
+    form (NFKD), dropping every non-ASCII character, then decoding the bytes.
+
+    >>> unaccent('hello')
+    'hello'
+    >>> unaccent('HELLO')
+    'HELLO'
+    >>> unaccent('héllo')
+    'hello'
+    >>> unaccent('héllö')
+    'hello'
+    >>> unaccent('HÉLLÖ')
+    'HELLO'
+    """
+    return unicodedata.normalize('NFKD', text).\
+        encode('ascii', 'ignore').\
+        decode('utf-8')
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()  # run the doctest examples in this module's docstrings