From: Neil Smith <neil.git@njae.me.uk>
Date: Mon, 27 Jan 2014 11:24:18 +0000 (+0000)
Subject: Fixed accent removal
X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=5e31b8928eb08839244c2c36981b50e0f20959a2;p=cipher-tools.git

Fixed accent removal
---

diff --git a/language_models.py b/language_models.py
index e4db178..5626edb 100644
--- a/language_models.py
+++ b/language_models.py
@@ -1,8 +1,8 @@
+import string
 import norms
-import itertools
 import random
-import bisect
 import collections
+import unicodedata
 
 english_counts = collections.defaultdict(int)
 with open('count_1l.txt', 'r') as f:
@@ -43,3 +43,50 @@ def random_english_letter():
 	"""Generate a random letter based on English letter counts
 	"""
 	return weighted_choice(normalised_english_counts)
+
+
+def letters(text):
+    """Remove all non-alphabetic characters from a text
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
+
+def unaccent(text):
+	"""Remove all accents from letters. 
+	It does this by converting the unicode string to decomposed compatibility
+	form, encoding to ASCII to drop the combining accents, then decoding the bytes.
+
+	>>> unaccent('hello')
+	'hello'
+	>>> unaccent('HELLO')
+	'HELLO'
+	>>> unaccent('héllo')
+	'hello'
+	>>> unaccent('héllö')
+	'hello'
+	>>> unaccent('HÉLLÖ')
+	'HELLO'
+	"""
+	return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
+
+def sanitise(text):
+    """Remove accents and all non-alphabetic characters and convert the text to lowercase
+    
+    >>> sanitise('The Quick')
+    'thequick'
+    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'thequickbrownfoxjumpedoverthelazydog'
+    >>> sanitise('HÉLLÖ')
+    'hello'
+    """
+    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
+    # return ''.join(sanitised)
+    return letters(unaccent(text)).lower()
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()