Minor documentation updates
[szyfrow.git] / szyfrow / support / language_models.py
1 """Descriptive models of a natural language (in this case, English).
2
3 The functions `Pwords`, `Pletters`, `Pbigrams`, and `Ptrigrams` return the
4 log probability of a section of text.
5
6 If you want to use a different language, replace the data files in
7 [`szyfrow/language_model_files`](../language_model_files/index.html).
8
* `count_1l.txt`: counts of single letters
* `count_2l.txt`: counts of pairs of letters, bigrams
* `count_3l.txt`: counts of triples of letters, trigrams
* `words.txt`: a dictionary of words, used for keyword-based cipher breaking.
  These words should only contain characters contained in
  `string.ascii_letters`.
15
16 """
17
18 import string
19 import random
20 import collections
21 import itertools
22 from math import log10
23 import os
24 import importlib.resources as pkg_resources
25
26 import szyfrow.support.norms
27 from szyfrow.support.utilities import sanitise, deduplicate
28 from szyfrow import language_model_files
29
30
def datafile(name, sep='\t'):
    """Read key,value pairs from a data file bundled in
    `szyfrow.language_model_files`.

    Each line of the file is `key<sep>count`. Yields `[key, count]`
    pairs, with the count converted to `int`.

    Parameters:
    * `name`: filename within the `language_model_files` package.
    * `sep`: field separator between key and count (default: tab).
    """
    with pkg_resources.open_text(language_model_files, name) as f:
        for line in f:
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
# Letter n-gram frequency tables for English, loaded once at import time
# from the packaged data files (see module docstring for the file formats).
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
"""Counts of single letters in English."""
normalised_english_counts = szyfrow.support.norms.normalise(english_counts)
"""Normalised counts of single letters in English (the sum of all counts
adds to 1)."""

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
"""Counts of letter bigrams in English."""
normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)
"""Normalised counts of letter bigrams in English (the sum of all counts
adds to 1)."""

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
"""Counts of letter trigrams in English."""
normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)
"""Normalised counts of letter trigrams in English (the sum of all counts
adds to 1)."""
57
keywords = []
"""A sample list of keywords, to act as a dictionary for
dictionary-based cipher breaking attempts."""
# Populate the keyword list from the packaged word list, one word per
# line; rstrip removes the trailing newline (and any trailing whitespace).
with pkg_resources.open_text(language_model_files, 'words.txt') as f:
    keywords = [line.rstrip() for line in f]
63
64
def transpositions_of(keyword):
    """Find the transpositions given by a keyword. For instance, the keyword
    'clever' rearranges to 'celrv', so the first column (0) stays first, the
    second column (1) moves to third, the third column (2) moves to second,
    and so on.

    If passed a tuple, assume it's already a transposition and just return it.

    >>> transpositions_of('clever')
    (0, 2, 1, 4, 3)
    >>> transpositions_of('fred')
    (3, 2, 0, 1)
    >>> transpositions_of((3, 2, 0, 1))
    (3, 2, 0, 1)
    """
    # Already a transposition: hand it straight back.
    if isinstance(keyword, tuple):
        return keyword
    letters = deduplicate(keyword)
    # Position of each letter once the (deduplicated) key is sorted.
    return tuple(letters.index(letter) for letter in sorted(letters))
86
transpositions = collections.defaultdict(list)
"""A sample dict of transpositions, to act as a dictionary for
dictionary-based cipher breaking attempts. Each key is a transposition,
each value is a list of words that give that transposition."""
# Group the keyword list by the transposition each word generates, so
# breaking code can recover candidate keywords from a transposition.
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
93
94
def weighted_choice(d):
    """Generate a random item from a dictionary of item counts, picking
    each key with probability proportional to its count.

    Parameters:
    * `d`: a non-empty mapping of item -> count (or weight).

    Returns one key of `d`, chosen at random.
    """
    items, weights = list(zip(*d.items()))
    # random.choices returns a one-element list; unwrap it.
    return random.choices(items, weights)[0]
107
def random_english_letter():
    """Return one random letter, drawn according to English letter
    frequencies (from `normalised_english_counts`).
    """
    letter = weighted_choice(normalised_english_counts)
    return letter
112
113
def ngrams(text, n):
    """Return every n-gram (each length-n slice) of a text.

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
    'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
    'rown', 'ownf', 'wnfo', 'nfox']
    """
    grams = []
    for start in range(len(text) - n + 1):
        grams.append(text[start:start + n])
    return grams
125
126
class Pdist(dict):
    """A probability distribution estimated from counts in a datafile.
    Known keys map to their log (base 10) probabilities.

    Parameters:
    * `data`: an iterable of (key, count) pairs (may be a one-shot
      generator, such as the output of `datafile`).
    * `estimate_of_missing`: a function of (key, total_count) used to
      score keys absent from the data. The default returns 1/total_count
      — note this is not a log probability; it is kept for backward
      compatibility with existing callers.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        # `data` may only be iterable once, so tee it: one pass to find
        # the total count, a second to fill in the entries.
        counts_for_total, counts_for_entries = itertools.tee(data)
        self.total = sum(count for _, count in counts_for_total)
        for key, count in counts_for_entries:
            self[key] = log10(count / self.total)
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
    def __missing__(self, key):
        # Unknown keys are scored by the estimator instead of raising KeyError.
        return self.estimate_of_missing(key, self.total)
139
def log_probability_of_unknown_word(key, N):
    """Estimate the log probability of a word not seen in the corpus,
    penalising longer words more heavily.
    """
    length_penalty = 10 ** ((len(key) - 2) * 1.4)
    return -log10(N * length_penalty)
144
# Module-level probability distributions, built once at import time from
# the packaged count files.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of words. Unknown words have their probability estimated by
[log_probability_of_unknown_word](#szyfrow.support.language_models.log_probability_of_unknown_word)"""
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of single letters. Unknown words have their probability estimated as zero."""
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of letter bigrams. Unknown words have their probability estimated as zero."""
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
"""A [Pdist](#szyfrow.support.language_models.Pdist) holding log probabilities
of letter trigrams. Unknown words have their probability estimated as zero."""
158
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words: the sum of
    each word's log probability under `Pw`.
    """
    total = 0
    for word in words:
        total += Pw[word.lower()]
    return total
163
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters: the sum
    of each letter's log probability under `Pl`.
    """
    total = 0
    for letter in letters:
        total += Pl[letter.lower()]
    return total
168
def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a
    sequence of letters, summed under `P2l`.
    """
    total = 0
    for bigram in ngrams(letters, 2):
        total += P2l[bigram]
    return total
174
def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a
    sequence of letters, summed under `P3l`.
    """
    total = 0
    for trigram in ngrams(letters, 3):
        total += P3l[trigram]
    return total
180
181
def cosine_distance_score(text):
    """Find the dissimilarity of a text to English, using the cosine
    distance (1 - cosine similarity) between the text's letter frequency
    distribution and the English letter counts. Lower scores mean the
    text looks more like English.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.73771...
    """
    return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
        collections.Counter(sanitise(text)))
193
194
if __name__ == "__main__":
    # Run the embedded doctests when this module is executed as a script.
    import doctest
    doctest.testmod()