X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=szyfrow%2Fsupport%2Flanguage_models.py;h=13aeeaf09ebd3965ad9273be0f94ae2ca1ce79f6;hb=refs%2Fheads%2Fmain;hp=6898eb4688031c1e21a31cbef64b5a66c3400b83;hpb=6dae779d336388fdec43f684e9607c25423d1572;p=szyfrow.git diff --git a/szyfrow/support/language_models.py b/szyfrow/support/language_models.py index 6898eb4..13aeeaf 100644 --- a/szyfrow/support/language_models.py +++ b/szyfrow/support/language_models.py @@ -1,16 +1,30 @@ +"""Descriptive models of a natural language (in this case, English). + +The functions `Pwords`, `Pletters`, `Pbigrams`, and `Ptrigrams` return the +log probability of a section of text. + +If you want to use a different language, replace the data files in +[`szyfrow/language_model_files`](../language_model_files/index.html). + +* `count_1l.txt`: counts of single letters +* `count_2l.txt`: counts of pairs of letters, bigrams +* `count_3l.txt`: counts of triples of letters, trigrams +* `words.txt`: a dictionary of words, used for keyword-based cipher breaking. + These words should only contain characters contained in + `string.ascii_letters`. 
+ +""" + import string import random import collections import itertools from math import log10 import os - import importlib.resources as pkg_resources import szyfrow.support.norms -from szyfrow.support.utilities import sanitise - - +from szyfrow.support.utilities import sanitise, deduplicate from szyfrow import language_model_files @@ -24,28 +38,72 @@ def datafile(name, sep='\t'): yield [splits[0], int(splits[1])] english_counts = collections.Counter(dict(datafile('count_1l.txt'))) +"""Counts of single letters in English.""" normalised_english_counts = szyfrow.support.norms.normalise(english_counts) +"""Normalised counts of single letters in English (the sum of all counts +adds to 1).""" english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt'))) +"""Counts of letter bigrams in English.""" normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts) +"""Normalised counts of letter bigrams in English (the sum of all counts +adds to 1).""" english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt'))) +"""Counts of letter trigrams in English.""" normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts) +"""Normalised counts of letter trigrams in English (the sum of all counts +adds to 1).""" +keywords = [] +"""A sample list of keywords, to act as a dictionary for +dictionary-based cipher breaking attempts.""" with pkg_resources.open_text(language_model_files, 'words.txt') as f: keywords = [line.rstrip() for line in f] +def transpositions_of(keyword): + """Finds the transpositions given by a keyword. For instance, the keyword + 'clever' rearranges to 'celrv', so the first column (0) stays first, the + second column (1) moves to third, the third column (2) moves to second, + and so on. + + If passed a tuple, assume it's already a transposition and just return it. 
+ + >>> transpositions_of('clever') + (0, 2, 1, 4, 3) + >>> transpositions_of('fred') + (3, 2, 0, 1) + >>> transpositions_of((3, 2, 0, 1)) + (3, 2, 0, 1) + """ + if isinstance(keyword, tuple): + return keyword + else: + key = deduplicate(keyword) + transpositions = tuple(key.index(l) for l in sorted(key)) + return transpositions + +transpositions = collections.defaultdict(list) +"""A sample dict of transpositions, to act as a dictionary for +dictionary-based cipher breaking attempts. Each key is a transposition, +each value is a list of words that give that transposition.""" +for word in keywords: + transpositions[transpositions_of(word)] += [word] + + def weighted_choice(d): """Generate random item from a dictionary of item counts """ - target = random.uniform(0, sum(d.values())) - cuml = 0.0 - for (l, p) in d.items(): - cuml += p - if cuml > target: - return l - return None + delems, dweights = list(zip(*d.items())) + return random.choices(delems, dweights)[0] + # target = random.uniform(0, sum(d.values())) + # cuml = 0.0 + # for (l, p) in d.items(): + # cuml += p + # if cuml > target: + # return l + # return None def random_english_letter(): """Generate a random letter based on English letter counts @@ -85,9 +143,18 @@ def log_probability_of_unknown_word(key, N): return -log10(N * 10**((len(key) - 2) * 1.4)) Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word) +"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities +of words. Unknown words have their probability estimated by +[log_probability_of_unknown_word](#szyfrow.support.language_models.log_probability_of_unknown_word)""" Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0) +"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities +of single letters. 
Unknown letters have their probability estimated as zero.""" P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0) +"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities +of letter bigrams. Unknown bigrams have their probability estimated as zero.""" P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0) +"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities +of letter trigrams. Unknown trigrams have their probability estimated as zero.""" def Pwords(words): """The Naive Bayes log probability of a sequence of words.