# Text-cleaning helpers reconstructed from the patch "Fixed accent removal"
# (Neil Smith, Mon, 27 Jan 2014 11:24:18 +0000) against language_models.py,
# commit 5e31b8928eb08839244c2c36981b50e0f20959a2 of cipher-tools.git.
#
# Only the definitions that the patch adds in full are reproduced here.  The
# patch's context lines (the letter-count loading code and the tail of
# random_english_letter) are fragments and are intentionally left out.
import string
import unicodedata

# Built once so that membership tests in letters() are constant-time.
_ASCII_LETTERS = frozenset(string.ascii_letters)


def letters(text: str) -> str:
    """Remove every character that is not an ASCII letter from a text.

    Case is preserved.  Accented letters are not ASCII letters, so they are
    removed as well; pass the text through unaccent() first to keep their
    base letters.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    >>> letters('')
    ''
    """
    return ''.join(c for c in text if c in _ASCII_LETTERS)


def unaccent(text: str) -> str:
    """Remove all accents from letters.

    The text is converted to Unicode decomposed compatibility form (NFKD),
    which splits an accented letter into its base letter followed by
    combining marks.  The result is then encoded to ASCII with
    errors='ignore', which discards the combining marks.  Any other
    character that still has no ASCII form after decomposition is discarded
    by the same step.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Decode with the same codec used to encode: the bytes are pure ASCII.
    return decomposed.encode('ascii', 'ignore').decode('ascii')


def sanitise(text: str) -> str:
    """Remove all non-alphabetic characters and convert the text to lowercase.

    Accents are stripped first (see unaccent), so accented letters survive
    as their base letters instead of being removed.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    return letters(unaccent(text)).lower()


if __name__ == "__main__":
    import doctest
    doctest.testmod()