From 5e31b8928eb08839244c2c36981b50e0f20959a2 Mon Sep 17 00:00:00 2001
From: Neil Smith
Date: Mon, 27 Jan 2014 11:24:18 +0000
Subject: [PATCH 1/1] Fixed accent removal

---
 language_models.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/language_models.py b/language_models.py
index e4db178..5626edb 100644
--- a/language_models.py
+++ b/language_models.py
@@ -1,8 +1,8 @@
+import string
 import norms
-import itertools
 import random
-import bisect
 import collections
+import unicodedata
 
 english_counts = collections.defaultdict(int)
 with open('count_1l.txt', 'r') as f:
@@ -43,3 +43,50 @@ def random_english_letter():
     """Generate a random letter based on English letter counts
     """
     return weighted_choice(normalised_english_counts)
+
+
+def letters(text):
+    """Remove all non-alphabetic characters from a text
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
+
+def unaccent(text):
+    """Remove all accents from letters.
+    It does this by converting the unicode string to decomposed compatibility
+    form, dropping all the combining accents, then re-encoding the bytes.
+
+    >>> unaccent('hello')
+    'hello'
+    >>> unaccent('HELLO')
+    'HELLO'
+    >>> unaccent('héllo')
+    'hello'
+    >>> unaccent('héllö')
+    'hello'
+    >>> unaccent('HÉLLÖ')
+    'HELLO'
+    """
+    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
+
+def sanitise(text):
+    """Remove all non-alphabetic characters and convert the text to lowercase
+
+    >>> sanitise('The Quick')
+    'thequick'
+    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'thequickbrownfoxjumpedoverthelazydog'
+    >>> sanitise('HÉLLÖ')
+    'hello'
+    """
+    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
+    # return ''.join(sanitised)
+    return letters(unaccent(text)).lower()
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
-- 
2.34.1