X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=language_models.py;fp=language_models.py;h=5626edbfd03bd802877e7114699437af7dd6164a;hb=5e31b8928eb08839244c2c36981b50e0f20959a2;hp=e4db178c0715e08c7467774370cfe5b1db5392a1;hpb=eaecd10e334e6d63d2fd222bc280b02febca5a1b;p=cipher-tools.git

diff --git a/language_models.py b/language_models.py
index e4db178..5626edb 100644
--- language_models.py
+++ language_models.py
@@ -1,8 +1,8 @@
+import string
 import norms
-import itertools
 import random
-import bisect
 import collections
+import unicodedata
 
 english_counts = collections.defaultdict(int)
 with open('count_1l.txt', 'r') as f:
@@ -43,3 +43,50 @@ def random_english_letter():
     """Generate a random letter based on English letter counts
     """
     return weighted_choice(normalised_english_counts)
+
+
+def letters(text):
+    """Remove all non-alphabetic characters from a text
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
+
+def unaccent(text):
+    """Remove all accents from letters.
+    It does this by converting the unicode string to decomposed compatibility
+    form, dropping all the combining accents, then re-encoding the bytes.
+
+    >>> unaccent('hello')
+    'hello'
+    >>> unaccent('HELLO')
+    'HELLO'
+    >>> unaccent('héllo')
+    'hello'
+    >>> unaccent('héllö')
+    'hello'
+    >>> unaccent('HÉLLÖ')
+    'HELLO'
+    """
+    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
+
+def sanitise(text):
+    """Remove all non-alphabetic characters and convert the text to lowercase
+
+    >>> sanitise('The Quick')
+    'thequick'
+    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'thequickbrownfoxjumpedoverthelazydog'
+    >>> sanitise('HÉLLÖ')
+    'hello'
+    """
+    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
+    # return ''.join(sanitised)
+    return letters(unaccent(text)).lower()
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()