Fixed accent removal
author Neil Smith <neil.git@njae.me.uk>
Mon, 27 Jan 2014 11:24:18 +0000 (11:24 +0000)
committer Neil Smith <neil.git@njae.me.uk>
Mon, 27 Jan 2014 11:24:18 +0000 (11:24 +0000)
language_models.py

index e4db178c0715e08c7467774370cfe5b1db5392a1..5626edbfd03bd802877e7114699437af7dd6164a 100644 (file)
@@ -1,8 +1,8 @@
+import string
 import norms
-import itertools
 import random
-import bisect
 import collections
+import unicodedata
 
 english_counts = collections.defaultdict(int)
 with open('count_1l.txt', 'r') as f:
@@ -43,3 +43,50 @@ def random_english_letter():
        """Generate a random letter based on English letter counts
        """
        return weighted_choice(normalised_english_counts)
+
+
+def letters(text):
+    """Remove all non-alphabetic characters from a text
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
+
+def unaccent(text):
+       """Remove all accents from letters. 
+       It does this by converting the unicode string to decomposed compatibility
+       form, dropping all the combining accents, then re-encoding the bytes.
+
+       >>> unaccent('hello')
+       'hello'
+       >>> unaccent('HELLO')
+       'HELLO'
+       >>> unaccent('héllo')
+       'hello'
+       >>> unaccent('héllö')
+       'hello'
+       >>> unaccent('HÉLLÖ')
+       'HELLO'
+       """
+       return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
+
+def sanitise(text):
+    """Remove all non-alphabetic characters and convert the text to lowercase
+    
+    >>> sanitise('The Quick')
+    'thequick'
+    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'thequickbrownfoxjumpedoverthelazydog'
+    >>> sanitise('HÉLLÖ')
+    'hello'
+    """
+    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
+    # return ''.join(sanitised)
+    return letters(unaccent(text)).lower()
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()