language_models.py

   1 import string
   2 import norms
   3 import random
   4 import collections
   5 import unicodedata
   6 import itertools
   7 from math import log10
   8
   9 def letters(text):
  10     """Remove all non-alphabetic characters from a text
  11     >>> letters('The Quick')
  12     'TheQuick'
  13     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  14     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  15     """
  16     return ''.join([c for c in text if c in string.ascii_letters])
  17
  18 def unaccent(text):
  19     """Remove all accents from letters.
  20     It does this by converting the unicode string to decomposed compatability
  21     form, dropping all the combining accents, then re-encoding the bytes.
  22
  23     >>> unaccent('hello')
  24     'hello'
  25     >>> unaccent('HELLO')
  26     'HELLO'
  27     >>> unaccent('héllo')
  28     'hello'
  29     >>> unaccent('héllö')
  30     'hello'
  31     >>> unaccent('HÉLLÖ')
  32     'HELLO'
  33     """
  34     return unicodedata.normalize('NFKD', text).\
  35         encode('ascii', 'ignore').\
  36         decode('utf-8')
  37
  38 def sanitise(text):
  39     """Remove all non-alphabetic characters and convert the text to lowercase
  40
  41     >>> sanitise('The Quick')
  42     'thequick'
  43     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  44     'thequickbrownfoxjumpedoverthelazydog'
  45     >>> sanitise('HÉLLÖ')
  46     'hello'
  47     """
  48     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  49     # return ''.join(sanitised)
  50     return letters(unaccent(text)).lower()
  51
  52
  53 def datafile(name, sep='\t'):
  54     """Read key,value pairs from file.
  55     """
  56     with open(name, 'r') as f:
  57         for line in f:
  58             splits = line.split(sep)
  59             yield [splits[0], int(splits[1])]
  60
# Reference counts loaded at import time from tab-separated count files,
# looked up relative to the current working directory. Each raw Counter is
# paired with the result of norms.normalise on it (norms is a separate
# module; presumably this scales the counts -- confirm there).
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One keyword per line of words.txt, with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]
  72
  73
  74 def weighted_choice(d):
  75         """Generate random item from a dictionary of item counts
  76         """
  77         target = random.uniform(0, sum(d.values()))
  78         cuml = 0.0
  79         for (l, p) in d.items():
  80                 cuml += p
  81                 if cuml > target:
  82                         return l
  83         return None
  84
  85 def random_english_letter():
  86         """Generate a random letter based on English letter counts
  87         """
  88         return weighted_choice(normalised_english_counts)
  89
  90
  91 def ngrams(text, n):
  92     """Returns all n-grams of a text
  93
  94     >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
  95     ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
  96      'nf', 'fo', 'ox']
  97     >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
  98     ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
  99      'rown', 'ownf', 'wnfo', 'nfox']
 100     """
 101     return [text[i:i+n] for i in range(len(text)-n+1)]
 102
 103
 104 class Pdist(dict):
 105     """A probability distribution estimated from counts in datafile.
 106     Values are stored and returned as log probabilities.
 107     """
 108     def __init__(self, data=[], estimate_of_missing=None):
 109         data1, data2 = itertools.tee(data)
 110         self.total = sum([d[1] for d in data1])
 111         for key, count in data2:
 112             self[key] = log10(count / self.total)
 113         self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
 114     def __missing__(self, key):
 115         return self.estimate_of_missing(key, self.total)
 116
 117 def log_probability_of_unknown_word(key, N):
 118     """Estimate the probability of an unknown word.
 119     """
 120     return -log10(N * 10**((len(key) - 2) * 1.4))
 121
# Log-probability distributions built at import time from the count files.
# Pw scores unseen words with a length-dependent estimate; Pw_wrong uses the
# same counts but gives every unseen word log10(1/N) regardless of length.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N))
# Keys missing from the letter, bigram and trigram tables score 0.
# NOTE(review): as a log probability, 0 means probability 1 -- confirm that
# this is the intended treatment of unseen n-grams.
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
 127
 128 def Pwords(words):
 129     """The Naive Bayes log probability of a sequence of words.
 130     """
 131     return sum(Pw[w.lower()] for w in words)
 132
 133 def Pwords_wrong(words):
 134     """The Naive Bayes log probability of a sequence of words.
 135     """
 136     return sum(Pw_wrong[w.lower()] for w in words)
 137
 138
 139 def Pletters(letters):
 140     """The Naive Bayes log probability of a sequence of letters.
 141     """
 142     return sum(Pl[l.lower()] for l in letters)
 143
 144 def Pbigrams(letters):
 145     """The Naive Bayes log probability of the bigrams formed from a sequence
 146     of letters.
 147     """
 148     return sum(P2l[p] for p in ngrams(letters, 2))
 149
 150 def Ptrigrams(letters):
 151     """The Naive Bayes log probability of the trigrams formed from a sequence
 152     of letters.
 153     """
 154     return sum(P3l[p] for p in ngrams(letters, 3))
 155
 156
 157 def cosine_similarity_score(text):
 158     """Finds the dissimilarity of a text to English, using the cosine distance
 159     of the frequency distribution.
 160
 161     >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
 162     0.26228882...
 163     """
 164     return norms.cosine_similarity(english_counts,
 165         collections.Counter(sanitise(text)))
 166
 167
if __name__ == "__main__":
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest
    doctest.testmod()