X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=szyfrow%2Fsupport%2Flanguage_models.py;h=95dbeb277b8c828f62cc6e8ac83938e683491ed0;hb=3350a462f460e81d96c587466f5b6a88cbba1f7e;hp=271b8dba023f93f35ab9c6b93d51e10d36fcdcb4;hpb=27c8005f6dea0026887b80a01b5f93a8f1b3c2b2;p=szyfrow.git diff --git a/szyfrow/support/language_models.py b/szyfrow/support/language_models.py index 271b8db..95dbeb2 100644 --- a/szyfrow/support/language_models.py +++ b/szyfrow/support/language_models.py @@ -4,14 +4,18 @@ import collections import itertools from math import log10 import os +import importlib.resources as pkg_resources import szyfrow.support.norms -from szyfrow.support.utilities import sanitise +from szyfrow.support.utilities import sanitise, deduplicate +from szyfrow import language_model_files + def datafile(name, sep='\t'): """Read key,value pairs from file. """ - with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), name), 'r') as f: + with pkg_resources.open_text(language_model_files, name) as f: + # with open(p name), 'r') as f: for line in f: splits = line.split(sep) yield [splits[0], int(splits[1])] @@ -25,10 +29,37 @@ normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigra english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt'))) normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts) -with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'words.txt'), 'r') as f: +with pkg_resources.open_text(language_model_files, 'words.txt') as f: keywords = [line.rstrip() for line in f] +def transpositions_of(keyword): + """Finds the transpositions given by a keyword. For instance, the keyword + 'clever' rearranges to 'celrv', so the first column (0) stays first, the + second column (1) moves to third, the third column (2) moves to second, + and so on. + + If passed a tuple, assume it's already a transposition and just return it. 
+ + >>> transpositions_of('clever') + (0, 2, 1, 4, 3) + >>> transpositions_of('fred') + (3, 2, 0, 1) + >>> transpositions_of((3, 2, 0, 1)) + (3, 2, 0, 1) + """ + if isinstance(keyword, tuple): + return keyword + else: + key = deduplicate(keyword) + transpositions = tuple(key.index(l) for l in sorted(key)) + return transpositions + +transpositions = collections.defaultdict(list) +for word in keywords: + transpositions[transpositions_of(word)] += [word] + + def weighted_choice(d): """Generate random item from a dictionary of item counts """