szyfrow/support/language_models.py
import random
import collections
import itertools
from math import log10
import importlib.resources as pkg_resources

import szyfrow.support.norms
from szyfrow.support.utilities import sanitise, deduplicate
from szyfrow import language_model_files


def datafile(name, sep='\t'):
    """Read key,value pairs from file.
    """
    with pkg_resources.open_text(language_model_files, name) as f:
        for line in f:
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]

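# A hedged note on the data format: each count file is assumed to hold one
# tab-separated "item<TAB>count" record per line, matching the sep='\t' and
# int(splits[1]) handling above. The files loaded below are the packaged
# letter, bigram, trigram and word counts.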
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = szyfrow.support.norms.normalise(english_counts)

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = szyfrow.support.norms.normalise(english_bigram_counts)

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = szyfrow.support.norms.normalise(english_trigram_counts)

with pkg_resources.open_text(language_model_files, 'words.txt') as f:
    keywords = [line.rstrip() for line in f]


def transpositions_of(keyword):
    """Finds the transpositions given by a keyword. For instance, the keyword
    'clever' deduplicates to 'clevr', whose letters sort as 'celrv', so the
    first column (0) stays first, the second column (1) moves to third, the
    third column (2) moves to second, and so on.

    If passed a tuple, assume it's already a transposition and just return it.

    >>> transpositions_of('clever')
    (0, 2, 1, 4, 3)
    >>> transpositions_of('fred')
    (3, 2, 0, 1)
    >>> transpositions_of((3, 2, 0, 1))
    (3, 2, 0, 1)
    """
    if isinstance(keyword, tuple):
        return keyword
    else:
        key = deduplicate(keyword)
        transpositions = tuple(key.index(l) for l in sorted(key))
        return transpositions

transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)].append(word)
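# transpositions now maps each transposition tuple back to every keyword in
# words.txt that yields it; if 'fred' appears in words.txt, for example, it
# would be collected under the key (3, 2, 0, 1).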


def weighted_choice(d):
    """Generate a random item from a dictionary of item counts, choosing
    each item with probability proportional to its count.
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    for (l, p) in d.items():
        cuml += p
        if cuml > target:
            return l
    return None
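# A hedged usage sketch: weighted_choice({'a': 3, 'b': 1}) returns 'a' about
# three times as often as 'b'. With normalised counts the values sum to 1 and
# act directly as probabilities.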

def random_english_letter():
    """Generate a random letter based on English letter counts
    """
    return weighted_choice(normalised_english_counts)


def ngrams(text, n):
    """Returns all n-grams of a text

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    return [text[i:i+n] for i in range(len(text)-n+1)]


class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log probabilities.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        data1, data2 = itertools.tee(data)
        self.total = sum(d[1] for d in data1)
        for key, count in data2:
            self[key] = log10(count / self.total)
        # NB: the default fallback returns a plain probability, not a log
        # probability; every instance below supplies a log-space estimator.
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)

    def __missing__(self, key):
        return self.estimate_of_missing(key, self.total)
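# A hedged usage sketch: Pdist behaves as a dict of log probabilities. Given
# the hypothetical data [('the', 80), ('of', 20)], a Pdist would store
# log10(80 / 100) for 'the' and defer to estimate_of_missing for unseen keys.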

def log_probability_of_unknown_word(key, N):
    """Estimate the log probability of an unknown word: the longer the word,
    the less likely it is, with each letter beyond the second reducing the
    estimate by a factor of 10**1.4 (roughly 25).
    """
    return -log10(N * 10**((len(key) - 2) * 1.4))

# Log-probability models for words, single letters, bigrams and trigrams.
# The letter models fall back to a log probability of 0 (i.e. probability 1)
# for unseen n-grams, so unknown n-grams leave a score unchanged.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)

def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.
    """
    return sum(Pw[w.lower()] for w in words)

def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.
    """
    return sum(Pl[l.lower()] for l in letters)

def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a sequence
    of letters.
    """
    return sum(P2l[p] for p in ngrams(letters, 2))

def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a sequence
    of letters.
    """
    return sum(P3l[p] for p in ngrams(letters, 3))
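# A hedged usage sketch: these scorers rank candidate plaintexts; among
# candidates, the one whose letters, bigrams or trigrams are most English-like
# gets the highest (least negative) total. Exact values depend on the
# packaged count files.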


def cosine_distance_score(text):
    """Finds the dissimilarity of a text to English, using the cosine distance
    of the frequency distribution.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.73771...
    """
    return 1 - szyfrow.support.norms.cosine_similarity(english_counts,
        collections.Counter(sanitise(text)))


if __name__ == "__main__":
    import doctest
    doctest.testmod()