language_models.py

   1 """Language-specific functions, including models of languages based on data of
   2 its use.
   3 """
   4
   5 import string
   6 import random
   7 import norms
   8 import collections
   9 import unicodedata
  10 import itertools
  11 from math import log10
  12
  13 def letters(text):
  14     """Remove all non-alphabetic characters from a text
  15     >>> letters('The Quick')
  16     'TheQuick'
  17     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  18     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  19     """
  20     return ''.join([c for c in text if c in string.ascii_letters])
  21
  22 def unaccent(text):
  23     """Remove all accents from letters.
  24     It does this by converting the unicode string to decomposed compatability
  25     form, dropping all the combining accents, then re-encoding the bytes.
  26
  27     >>> unaccent('hello')
  28     'hello'
  29     >>> unaccent('HELLO')
  30     'HELLO'
  31     >>> unaccent('héllo')
  32     'hello'
  33     >>> unaccent('héllö')
  34     'hello'
  35     >>> unaccent('HÉLLÖ')
  36     'HELLO'
  37     """
  38     return unicodedata.normalize('NFKD', text).\
  39         encode('ascii', 'ignore').\
  40         decode('utf-8')
  41
  42 def sanitise(text):
  43     """Remove all non-alphabetic characters and convert the text to lowercase
  44
  45     >>> sanitise('The Quick')
  46     'thequick'
  47     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  48     'thequickbrownfoxjumpedoverthelazydog'
  49     >>> sanitise('HÉLLÖ')
  50     'hello'
  51     """
  52     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  53     # return ''.join(sanitised)
  54     return letters(unaccent(text)).lower()
  55
  56
  57 def datafile(name, sep='\t'):
  58     """Read key,value pairs from file.
  59     """
  60     with open(name, 'r') as f:
  61         for line in f:
  62             splits = line.split(sep)
  63             yield [splits[0], int(splits[1])]
  64
  65 english_counts = collections.Counter(dict(datafile('count_1l.txt')))
  66 normalised_english_counts = norms.normalise(english_counts)
  67
  68 with open('words.txt', 'r') as f:
  69     keywords = [line.rstrip() for line in f]
  70
  71
  72 def weighted_choice(d):
  73     """Generate random item from a dictionary of item counts
  74     """
  75     target = random.uniform(0, sum(d.values()))
  76     cuml = 0.0
  77     for (l, p) in d.items():
  78         cuml += p
  79         if cuml > target:
  80             return l
  81     return None
  82
  83 def random_english_letter():
  84     """Generate a random letter based on English letter counts
  85     """
  86     return weighted_choice(normalised_english_counts)
  87
  88
  89 class Pdist(dict):
  90     """A probability distribution estimated from counts in datafile.
  91     Values are stored and returned as log probabilities.
  92     """
  93     def __init__(self, data=[], estimate_of_missing=None):
  94         data1, data2 = itertools.tee(data)
  95         self.total = sum([d[1] for d in data1])
  96         for key, count in data2:
  97             self[key] = log10(count / self.total)
  98         self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
  99     def __missing__(self, key):
 100         return self.estimate_of_missing(key, self.total)
 101
 102 def log_probability_of_unknown_word(key, N):
 103     """Estimate the probability of an unknown word.
 104     """
 105     return -log10(N * 10**((len(key) - 2) * 1.4))
 106
 107 Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
 108 Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N))
 109 Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
 110 P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
 111 P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
 112
 113 def Pwords(words):
 114     """The Naive Bayes log probability of a sequence of words.
 115     """
 116     return sum(Pw[w.lower()] for w in words)
 117
 118 def Pwords_wrong(words):
 119     """The Naive Bayes log probability of a sequence of words.
 120     """
 121     return sum(Pw_wrong[w.lower()] for w in words)
 122
 123 def Pletters(letters):
 124     """The Naive Bayes log probability of a sequence of letters.
 125     """
 126     return sum(Pl[l.lower()] for l in letters)
 127
 128
 129 def cosine_similarity_score(text):
 130     """Finds the dissimilarity of a text to English, using the cosine distance
 131     of the frequency distribution.
 132
 133     >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
 134     0.26228882...
 135     """
 136     return norms.cosine_similarity(english_counts,
 137                                    collections.Counter(sanitise(text)))
 138
 139
 140 if __name__ == "__main__":
 141     import doctest
 142     doctest.testmod()