X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=language_models.py;h=a6a711f1562d8c70f091165fa15330a825a48559;hb=21c390a77d42729afa23844ef2f1295106bed3de;hp=da5d2d07fa2003a3bf95a4a6629c1eafd666382b;hpb=d0a53e974970bc915d94280b5158b50f93054dc3;p=cipher-tools.git diff --git a/language_models.py b/language_models.py index da5d2d0..a6a711f 100644 --- a/language_models.py +++ b/language_models.py @@ -7,51 +7,6 @@ import itertools from math import log10 import os -unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'}) - -def letters(text): - """Remove all non-alphabetic characters from a text - >>> letters('The Quick') - 'TheQuick' - >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG') - 'TheQuickBROWNfoxjumpedoverthelazyDOG' - """ - return ''.join([c for c in text if c in string.ascii_letters]) - -def unaccent(text): - """Remove all accents from letters. - It does this by converting the unicode string to decomposed compatability - form, dropping all the combining accents, then re-encoding the bytes. - - >>> unaccent('hello') - 'hello' - >>> unaccent('HELLO') - 'HELLO' - >>> unaccent('héllo') - 'hello' - >>> unaccent('héllö') - 'hello' - >>> unaccent('HÉLLÖ') - 'HELLO' - """ - translated_text = text.translate(unaccent_specials) - return unicodedata.normalize('NFKD', translated_text).\ - encode('ascii', 'ignore').\ - decode('utf-8') - -def sanitise(text): - """Remove all non-alphabetic characters and convert the text to lowercase - - >>> sanitise('The Quick') - 'thequick' - >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG') - 'thequickbrownfoxjumpedoverthelazydog' - >>> sanitise('HÉLLÖ') - 'hello' - """ - # sanitised = [c.lower() for c in text if c in string.ascii_letters] - # return ''.join(sanitised) - return letters(unaccent(text)).lower() def datafile(name, sep='\t'):