language_models.py

   1 """Language-specific functions, including models of languages based on data of
   2 its use.
   3 """
   4
   5 import string
   6 import random
   7 import norms
   8 import collections
   9 import unicodedata
  10 import itertools
  11 from math import log10
  12
  13 unaccent_specials = ''.maketrans({"’": "'"})
  14
  15 def letters(text):
  16     """Remove all non-alphabetic characters from a text
  17     >>> letters('The Quick')
  18     'TheQuick'
  19     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  20     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  21     """
  22     return ''.join([c for c in text if c in string.ascii_letters])
  23
  24 def unaccent(text):
  25     """Remove all accents from letters.
  26     It does this by converting the unicode string to decomposed compatability
  27     form, dropping all the combining accents, then re-encoding the bytes.
  28
  29     >>> unaccent('hello')
  30     'hello'
  31     >>> unaccent('HELLO')
  32     'HELLO'
  33     >>> unaccent('héllo')
  34     'hello'
  35     >>> unaccent('héllö')
  36     'hello'
  37     >>> unaccent('HÉLLÖ')
  38     'HELLO'
  39     """
  40     translated_text = text.translate(unaccent_specials)
  41     return unicodedata.normalize('NFKD', translated_text).\
  42         encode('ascii', 'ignore').\
  43         decode('utf-8')
  44
  45 def sanitise(text):
  46     """Remove all non-alphabetic characters and convert the text to lowercase
  47
  48     >>> sanitise('The Quick')
  49     'thequick'
  50     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  51     'thequickbrownfoxjumpedoverthelazydog'
  52     >>> sanitise('HÉLLÖ')
  53     'hello'
  54     """
  55     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  56     # return ''.join(sanitised)
  57     return letters(unaccent(text)).lower()
  58
  59
  60 def datafile(name, sep='\t'):
  61     """Read key,value pairs from file.
  62     """
  63     with open(name, 'r') as f:
  64         for line in f:
  65             splits = line.split(sep)
  66             yield [splits[0], int(splits[1])]
  67
# Letter, bigram and trigram counts, loaded when this module is imported.
# The files are read with datafile()'s default tab separator and are opened
# by bare filename, i.e. relative to the current working directory.
# Each Counter has a counterpart produced by norms.normalise.
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One entry per line of words.txt, with trailing whitespace (including the
# newline) stripped.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]
  79
  80
  81 def weighted_choice(d):
  82     """Generate random item from a dictionary of item counts
  83     """
  84     target = random.uniform(0, sum(d.values()))
  85     cuml = 0.0
  86     for (l, p) in d.items():
  87         cuml += p
  88         if cuml > target:
  89             return l
  90     return None
  91
  92 def random_english_letter():
  93     """Generate a random letter based on English letter counts
  94     """
  95     return weighted_choice(normalised_english_counts)
  96
  97
  98 def ngrams(text, n):
  99     """Returns all n-grams of a text
 100
 101     >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
 102     ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
 103      'nf', 'fo', 'ox']
 104     >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
 105     ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
 106      'rown', 'ownf', 'wnfo', 'nfox']
 107     """
 108     return [text[i:i+n] for i in range(len(text)-n+1)]
 109
 110
 111 class Pdist(dict):
 112     """A probability distribution estimated from counts in datafile.
 113     Values are stored and returned as log probabilities.
 114     """
 115     def __init__(self, data=[], estimate_of_missing=None):
 116         data1, data2 = itertools.tee(data)
 117         self.total = sum([d[1] for d in data1])
 118         for key, count in data2:
 119             self[key] = log10(count / self.total)
 120         self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
 121     def __missing__(self, key):
 122         return self.estimate_of_missing(key, self.total)
 123
 124 def log_probability_of_unknown_word(key, N):
 125     """Estimate the probability of an unknown word.
 126     """
 127     return -log10(N * 10**((len(key) - 2) * 1.4))
 128
# Log-probability tables, built from the count files when this module is
# imported. The second argument to Pdist is the estimator used for keys that
# are absent from the file.

# Word table: unknown words are scored by log_probability_of_unknown_word.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
# Same word counts, but every unknown word gets log10(1/N) whatever its length.
Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N))
# Letter, bigram and trigram tables: an n-gram missing from the file scores 0.
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
 134
 135 def Pwords(words):
 136     """The Naive Bayes log probability of a sequence of words.
 137     """
 138     return sum(Pw[w.lower()] for w in words)
 139
 140 def Pwords_wrong(words):
 141     """The Naive Bayes log probability of a sequence of words.
 142     """
 143     return sum(Pw_wrong[w.lower()] for w in words)
 144
 145 def Pletters(letters):
 146     """The Naive Bayes log probability of a sequence of letters.
 147     """
 148     return sum(Pl[l.lower()] for l in letters)
 149
 150 def Pbigrams(letters):
 151     """The Naive Bayes log probability of the bigrams formed from a sequence
 152     of letters.
 153     """
 154     return sum(P2l[p] for p in ngrams(letters, 2))
 155
 156 def Ptrigrams(letters):
 157     """The Naive Bayes log probability of the trigrams formed from a sequence
 158     of letters.
 159     """
 160     return sum(P3l[p] for p in ngrams(letters, 3))
 161
 162
 163 def cosine_similarity_score(text):
 164     """Finds the dissimilarity of a text to English, using the cosine distance
 165     of the frequency distribution.
 166
 167     >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
 168     0.26228882...
 169     """
 170     return norms.cosine_similarity(english_counts,
 171                                    collections.Counter(sanitise(text)))
 172
 173
# When run as a script, execute the examples embedded in the docstrings
# above as doctests.
if __name__ == "__main__":
    import doctest
    doctest.testmod()