X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=language_models.py;h=a6a711f1562d8c70f091165fa15330a825a48559;hb=3d8f7067b9c3a48ef140d7cff834d18ee91f58b3;hp=0fa6e85dc7f3732e2c36a1c1bc4ead827005023e;hpb=defd4de8e665aa31bbf17487bcd5517c5c84b092;p=cipher-tools.git

diff --git a/language_models.py b/language_models.py
index 0fa6e85..a6a711f 100644
--- a/language_models.py
+++ b/language_models.py
@@ -7,51 +7,6 @@ import itertools
 from math import log10
 import os
 
-unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})
-
-def letters(text):
-    """Remove all non-alphabetic characters from a text
-    >>> letters('The Quick')
-    'TheQuick'
-    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
-    'TheQuickBROWNfoxjumpedoverthelazyDOG'
-    """
-    return ''.join([c for c in text if c in string.ascii_letters])
-
-def unaccent(text):
-    """Remove all accents from letters.
-    It does this by converting the unicode string to decomposed compatability
-    form, dropping all the combining accents, then re-encoding the bytes.
-
-    >>> unaccent('hello')
-    'hello'
-    >>> unaccent('HELLO')
-    'HELLO'
-    >>> unaccent('héllo')
-    'hello'
-    >>> unaccent('héllö')
-    'hello'
-    >>> unaccent('HÉLLÖ')
-    'HELLO'
-    """
-    translated_text = text.translate(unaccent_specials)
-    return unicodedata.normalize('NFKD', translated_text).\
-        encode('ascii', 'ignore').\
-        decode('utf-8')
-
-def sanitise(text):
-    """Remove all non-alphabetic characters and convert the text to lowercase
-
-    >>> sanitise('The Quick')
-    'thequick'
-    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
-    'thequickbrownfoxjumpedoverthelazydog'
-    >>> sanitise('HÉLLÖ')
-    'hello'
-    """
-    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
-    # return ''.join(sanitised)
-    return letters(unaccent(text)).lower()
 
 
 def datafile(name, sep='\t'):
@@ -76,20 +31,20 @@ with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'words.txt')
 
 
 def weighted_choice(d):
-    """Generate random item from a dictionary of item counts
-    """
-    target = random.uniform(0, sum(d.values()))
-    cuml = 0.0
-    for (l, p) in d.items():
-        cuml += p
-        if cuml > target:
-            return l
-    return None
+    """Generate random item from a dictionary of item counts
+    """
+    target = random.uniform(0, sum(d.values()))
+    cuml = 0.0
+    for (l, p) in d.items():
+        cuml += p
+        if cuml > target:
+            return l
+    return None
 
 def random_english_letter():
-    """Generate a random letter based on English letter counts
-    """
-    return weighted_choice(normalised_english_counts)
+    """Generate a random letter based on English letter counts
+    """
+    return weighted_choice(normalised_english_counts)
 
 
 def ngrams(text, n):
@@ -144,12 +99,6 @@ def Pbigrams(letters):
     """
     return sum(P2l[p] for p in ngrams(letters, 2))
 
-def Pbigrams(letters):
-    """The Naive Bayes log probability of the bigrams formed from a sequence
-    of letters.
-    """
-    return sum(P2l[p] for p in ngrams(letters, 2))
-
 def Ptrigrams(letters):
     """The Naive Bayes log probability of the trigrams formed from a sequence
     of letters.
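
For reference, below is a minimal, self-contained sketch of the cumulative-weight sampling that weighted_choice performs in the second hunk above. It is not part of the diff: the function body mirrors the code shown, but the example_counts dictionary and its weights are made up for illustration; the real module samples from normalised_english_counts, which it builds from its own letter-count data.

import random

# Illustrative (hypothetical) letter weights; the real module loads English
# letter counts from a data file and normalises them.
example_counts = {'e': 12.7, 't': 9.1, 'a': 8.2, 'o': 7.5}

def weighted_choice(d):
    """Pick a key of d with probability proportional to its value.

    Draw a uniform target in [0, sum of weights), then walk the items,
    accumulating weights until the running total passes the target.
    """
    target = random.uniform(0, sum(d.values()))
    cumulative = 0.0
    for item, weight in d.items():
        cumulative += weight
        if cumulative > target:
            return item
    return None  # fallback for an empty dict or degenerate weights

if __name__ == '__main__':
    # Heavier-weighted keys appear more often in the sample.
    print(''.join(weighted_choice(example_counts) for _ in range(20)))

Run as a script, this prints a short string in which 'e' and 't' dominate, which is the behaviour random_english_letter relies on when it samples from the English letter distribution.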