language_models.py

   1 import string
   2 import norms
   3 import random
   4 import collections
   5 import unicodedata
   6
   7 english_counts = collections.defaultdict(int)
   8 with open('count_1l.txt', 'r') as f:
   9     for line in f:
  10         (letter, count) = line.split("\t")
  11         english_counts[letter] = int(count)
  12 normalised_english_counts = norms.normalise(english_counts)
  13
  14 english_bigram_counts = collections.defaultdict(int)
  15 with open('count_2l.txt', 'r') as f:
  16     for line in f:
  17         (bigram, count) = line.split("\t")
  18         english_bigram_counts[bigram] = int(count)
  19 normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
  20
  21 english_trigram_counts = collections.defaultdict(int)
  22 with open('count_3l.txt', 'r') as f:
  23     for line in f:
  24         (trigram, count) = line.split("\t")
  25         english_trigram_counts[trigram] = int(count)
  26 normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
  27
  28 with open('words.txt', 'r') as f:
  29     keywords = [line.rstrip() for line in f]
  30
  31 def weighted_choice(d):
  32         """Generate a set of random items from a dictionary of item counts
  33         """
  34         target = random.uniform(0, sum(d.values()))
  35         cuml = 0.0
  36         for (l, p) in d.items():
  37                 cuml += p
  38                 if cuml > target:
  39                         return l
  40         return None
  41
  42 def random_english_letter():
  43         """Generate a random letter based on English letter counts
  44         """
  45         return weighted_choice(normalised_english_counts)
  46
  47
  48 def letters(text):
  49     """Remove all non-alphabetic characters from a text
  50     >>> letters('The Quick')
  51     'TheQuick'
  52     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  53     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  54     """
  55     return ''.join([c for c in text if c in string.ascii_letters])
  56
  57 def unaccent(text):
  58         """Remove all accents from letters.
  59         It does this by converting the unicode string to decomposed compatability
  60         form, dropping all the combining accents, then re-encoding the bytes.
  61
  62         >>> unaccent('hello')
  63         'hello'
  64         >>> unaccent('HELLO')
  65         'HELLO'
  66         >>> unaccent('héllo')
  67         'hello'
  68         >>> unaccent('héllö')
  69         'hello'
  70         >>> unaccent('HÉLLÖ')
  71         'HELLO'
  72         """
  73         return unicodedata.normalize('NFKD', text).\
  74                 encode('ascii', 'ignore').\
  75                 decode('utf-8')
  76
  77 def sanitise(text):
  78     """Remove all non-alphabetic characters and convert the text to lowercase
  79
  80     >>> sanitise('The Quick')
  81     'thequick'
  82     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  83     'thequickbrownfoxjumpedoverthelazydog'
  84     >>> sanitise('HÉLLÖ')
  85     'hello'
  86     """
  87     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  88     # return ''.join(sanitised)
  89     return letters(unaccent(text)).lower()
  90
  91
# When this file is run as a script, execute the doctests embedded in the
# docstrings of this module.
if __name__ == "__main__":
    import doctest
    doctest.testmod()