"""English n-gram frequency tables and helpers for sampling from them.

Importing this module reads ``count_1l.txt``, ``count_2l.txt``,
``count_3l.txt`` and ``words.txt`` (relative paths, so they are resolved
against the current working directory).
"""
# Recovered from the git blobdiff of language_models.py (cipher-tools.git,
# blob e4db178, added as a new file).

import bisect
import collections
import itertools
import random

import norms


def _load_counts(filename):
    """Read a tab-separated ``item<TAB>count`` file into a ``defaultdict(int)``.

    Each line must contain exactly one tab; anything else raises
    ``ValueError`` from the tuple unpacking or from ``int()``.  Items that
    never appear in the file read back as 0 because of the defaultdict.
    """
    counts = collections.defaultdict(int)
    with open(filename, 'r') as f:
        for line in f:
            (item, count) = line.split("\t")
            # int() ignores surrounding whitespace, so the trailing newline
            # left on ``count`` is harmless.
            counts[item] = int(count)
    return counts


# NOTE(review): norms.normalise lives in the project's ``norms`` module, which
# is not visible here; presumably it rescales the raw counts -- confirm there.
english_counts = _load_counts('count_1l.txt')
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = _load_counts('count_2l.txt')
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = _load_counts('count_3l.txt')
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One entry per line of words.txt, with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]


def weighted_choice(d):
    """Pick one random key of *d*, weighted by its value.

    *d* maps items to non-negative weights (counts or probabilities); an
    item's chance of being returned is proportional to its weight.

    Returns ``None`` only when *d* is empty or every weight is zero.
    """
    target = random.uniform(0, sum(d.values()))
    cumulative = 0.0
    last_weighted = None
    for (item, weight) in d.items():
        cumulative += weight
        if cumulative > target:
            return item
        if weight > 0:
            last_weighted = item
    # random.uniform may return its upper bound, and the running float sum
    # can land fractionally below the target, so the loop can finish without
    # a hit.  Fall back to the last item that carries weight rather than
    # returning None for a perfectly valid distribution.
    return last_weighted


def random_english_letter():
    """Return one random letter, weighted by the normalised English letter counts."""
    return weighted_choice(normalised_english_counts)