language_models.py

   1 import string
   2 import norms
   3 import random
   4 import collections
   5 import unicodedata
   6 import itertools
   7 from math import log10
   8 import os
   9
  10 unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})
  11
  12 def letters(text):
  13     """Remove all non-alphabetic characters from a text
  14     >>> letters('The Quick')
  15     'TheQuick'
  16     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  17     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  18     """
  19     return ''.join([c for c in text if c in string.ascii_letters])
  20
  21 def unaccent(text):
  22     """Remove all accents from letters.
  23     It does this by converting the unicode string to decomposed compatability
  24     form, dropping all the combining accents, then re-encoding the bytes.
  25
  26     >>> unaccent('hello')
  27     'hello'
  28     >>> unaccent('HELLO')
  29     'HELLO'
  30     >>> unaccent('héllo')
  31     'hello'
  32     >>> unaccent('héllö')
  33     'hello'
  34     >>> unaccent('HÉLLÖ')
  35     'HELLO'
  36     """
  37     translated_text = text.translate(unaccent_specials)
  38     return unicodedata.normalize('NFKD', translated_text).\
  39         encode('ascii', 'ignore').\
  40         decode('utf-8')
  41
  42 def sanitise(text):
  43     """Remove all non-alphabetic characters and convert the text to lowercase
  44
  45     >>> sanitise('The Quick')
  46     'thequick'
  47     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  48     'thequickbrownfoxjumpedoverthelazydog'
  49     >>> sanitise('HÉLLÖ')
  50     'hello'
  51     """
  52     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  53     # return ''.join(sanitised)
  54     return letters(unaccent(text)).lower()
  55
  56
  57 def datafile(name, sep='\t'):
  58     """Read key,value pairs from file.
  59     """
  60     with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), name), 'r') as f:
  61         for line in f:
  62             splits = line.split(sep)
  63             yield [splits[0], int(splits[1])]
  64
  65 english_counts = collections.Counter(dict(datafile('count_1l.txt')))
  66 normalised_english_counts = norms.normalise(english_counts)
  67
  68 english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
  69 normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
  70
  71 english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
  72 normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
  73
  74 with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'words.txt'), 'r') as f:
  75     keywords = [line.rstrip() for line in f]
  76
  77
  78 def weighted_choice(d):
  79         """Generate random item from a dictionary of item counts
  80         """
  81         target = random.uniform(0, sum(d.values()))
  82         cuml = 0.0
  83         for (l, p) in d.items():
  84                 cuml += p
  85                 if cuml > target:
  86                         return l
  87         return None
  88
  89 def random_english_letter():
  90         """Generate a random letter based on English letter counts
  91         """
  92         return weighted_choice(normalised_english_counts)
  93
  94
  95 def ngrams(text, n):
  96     """Returns all n-grams of a text
  97
  98     >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
  99     ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
 100      'nf', 'fo', 'ox']
 101     >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
 102     ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
 103      'rown', 'ownf', 'wnfo', 'nfox']
 104     """
 105     return [text[i:i+n] for i in range(len(text)-n+1)]
 106
 107
 108 class Pdist(dict):
 109     """A probability distribution estimated from counts in datafile.
 110     Values are stored and returned as log probabilities.
 111     """
 112     def __init__(self, data=[], estimate_of_missing=None):
 113         data1, data2 = itertools.tee(data)
 114         self.total = sum([d[1] for d in data1])
 115         for key, count in data2:
 116             self[key] = log10(count / self.total)
 117         self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
 118     def __missing__(self, key):
 119         return self.estimate_of_missing(key, self.total)
 120
 121 def log_probability_of_unknown_word(key, N):
 122     """Estimate the probability of an unknown word.
 123     """
 124     return -log10(N * 10**((len(key) - 2) * 1.4))
 125
 126 Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
 127 Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
 128 P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
 129
 130 def Pwords(words):
 131     """The Naive Bayes log probability of a sequence of words.
 132     """
 133     return sum(Pw[w.lower()] for w in words)
 134
 135 def Pletters(letters):
 136     """The Naive Bayes log probability of a sequence of letters.
 137     """
 138     return sum(Pl[l.lower()] for l in letters)
 139
 140 def Pbigrams(letters):
 141     """The Naive Bayes log probability of the bigrams formed from a sequence
 142     of letters.
 143     """
 144     return sum(P2l[p] for p in ngrams(letters, 2))
 145
 146
 147 def cosine_distance_score(text):
 148     """Finds the dissimilarity of a text to English, using the cosine distance
 149     of the frequency distribution.
 150
 151     >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
 152     0.370847405...
 153     """
 154     return norms.cosine_distance(english_counts,
 155         collections.Counter(sanitise(text)))
 156
 157
 158 if __name__ == "__main__":
 159     import doctest
 160     doctest.testmod()