language_models.py

   1 import norms
   2 import itertools
   3 import random
   4 import bisect
   5 import collections
   6
   7 english_counts = collections.defaultdict(int)
   8 with open('count_1l.txt', 'r') as f:
   9     for line in f:
  10         (letter, count) = line.split("\t")
  11         english_counts[letter] = int(count)
  12 normalised_english_counts = norms.normalise(english_counts)
  13
  14 english_bigram_counts = collections.defaultdict(int)
  15 with open('count_2l.txt', 'r') as f:
  16     for line in f:
  17         (bigram, count) = line.split("\t")
  18         english_bigram_counts[bigram] = int(count)
  19 normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
  20
  21 english_trigram_counts = collections.defaultdict(int)
  22 with open('count_3l.txt', 'r') as f:
  23     for line in f:
  24         (trigram, count) = line.split("\t")
  25         english_trigram_counts[trigram] = int(count)
  26 normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
  27
  28 with open('words.txt', 'r') as f:
  29     keywords = [line.rstrip() for line in f]
  30
  31 def weighted_choice(d):
  32         """Generate a set of random items from a dictionary of item counts
  33         """
  34         target = random.uniform(0, sum(d.values()))
  35         cuml = 0.0
  36         for (l, p) in d.items():
  37                 cuml += p
  38                 if cuml > target:
  39                         return l
  40         return None
  41
  42 def random_english_letter():
  43         """Generate a random letter based on English letter counts
  44         """
  45         return weighted_choice(normalised_english_counts)