language_models.py

   1 import string
   2 import random
   3 import collections
   4 import itertools
   5 from math import log10
   6 import os
   7
   8 import support.norms
   9 from support.utilities import sanitise
  10
  11 def datafile(name, sep='\t'):
  12     """Read key,value pairs from file.
  13     """
  14     with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), name), 'r') as f:
  15         for line in f:
  16             splits = line.split(sep)
  17             yield [splits[0], int(splits[1])]
  18
  19 english_counts = collections.Counter(dict(datafile('count_1l.txt')))
  20 normalised_english_counts = support.norms.normalise(english_counts)
  21
  22 english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
  23 normalised_english_bigram_counts = support.norms.normalise(english_bigram_counts)
  24
  25 english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
  26 normalised_english_trigram_counts = support.norms.normalise(english_trigram_counts)
  27
  28 with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'words.txt'), 'r') as f:
  29     keywords = [line.rstrip() for line in f]
  30
  31
  32 def weighted_choice(d):
  33     """Generate random item from a dictionary of item counts
  34     """
  35     target = random.uniform(0, sum(d.values()))
  36     cuml = 0.0
  37     for (l, p) in d.items():
  38         cuml += p
  39         if cuml > target:
  40             return l
  41     return None
  42
  43 def random_english_letter():
  44     """Generate a random letter based on English letter counts
  45     """
  46     return weighted_choice(normalised_english_counts)
  47
  48
  49 def ngrams(text, n):
  50     """Returns all n-grams of a text
  51
  52     >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
  53     ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
  54      'nf', 'fo', 'ox']
  55     >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
  56     ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
  57      'rown', 'ownf', 'wnfo', 'nfox']
  58     """
  59     return [text[i:i+n] for i in range(len(text)-n+1)]
  60
  61
  62 class Pdist(dict):
  63     """A probability distribution estimated from counts in datafile.
  64     Values are stored and returned as log probabilities.
  65     """
  66     def __init__(self, data=[], estimate_of_missing=None):
  67         data1, data2 = itertools.tee(data)
  68         self.total = sum([d[1] for d in data1])
  69         for key, count in data2:
  70             self[key] = log10(count / self.total)
  71         self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
  72     def __missing__(self, key):
  73         return self.estimate_of_missing(key, self.total)
  74
  75 def log_probability_of_unknown_word(key, N):
  76     """Estimate the probability of an unknown word.
  77     """
  78     return -log10(N * 10**((len(key) - 2) * 1.4))
  79
  80 Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
  81 Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
  82 P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
  83 P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
  84
  85 def Pwords(words):
  86     """The Naive Bayes log probability of a sequence of words.
  87     """
  88     return sum(Pw[w.lower()] for w in words)
  89
  90 def Pletters(letters):
  91     """The Naive Bayes log probability of a sequence of letters.
  92     """
  93     return sum(Pl[l.lower()] for l in letters)
  94
  95 def Pbigrams(letters):
  96     """The Naive Bayes log probability of the bigrams formed from a sequence
  97     of letters.
  98     """
  99     return sum(P2l[p] for p in ngrams(letters, 2))
 100
 101 def Ptrigrams(letters):
 102     """The Naive Bayes log probability of the trigrams formed from a sequence
 103     of letters.
 104     """
 105     return sum(P3l[p] for p in ngrams(letters, 3))
 106
 107
 108 def cosine_distance_score(text):
 109     """Finds the dissimilarity of a text to English, using the cosine distance
 110     of the frequency distribution.
 111
 112     >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
 113     0.73771...
 114     """
 115     # return support.norms.cosine_distance(english_counts,
 116     #     collections.Counter(sanitise(text)))
 117     return 1 - support.norms.cosine_similarity(english_counts,
 118         collections.Counter(sanitise(text)))
 119
 120
 121 if __name__ == "__main__":
 122     import doctest
 123     doctest.testmod()