Caesar ciphers

[cipher-training.git] / language_models.py
diff --git a/language_models.py b/language_models.py

index 4798d731610929e3fc8c31324e83db1a20eafd61..0ee039496d17639278f268cc3d3c8f1feab0d0aa 100644 (file)
--- a/language_models.py
+++ b/language_models.py
@@ -3,12 +3,7 @@ its use.
  """
  
  import string
-import random
-import norms
-import collections
  import unicodedata
-import itertools
-from math import log10
  
  def letters(text):
      """Remove all non-alphabetic characters from a text
@@ -54,39 +49,6 @@ def sanitise(text):
      return letters(unaccent(text)).lower()
  
  
-def datafile(name, sep='\t'):
-    """Read key,value pairs from file.
-    """
-    with open(name, 'r') as f:
-        for line in f:
-            splits = line.split(sep)
-            yield [splits[0], int(splits[1])]
-
-english_counts = collections.Counter(dict(datafile('count_1l.txt')))
-normalised_english_counts = norms.normalise(english_counts)
-
-# The log 10 probabilities of letters
-Pl = {l: log10(n) for l, n in normalised_english_counts.items()}
-
-with open('words.txt', 'r') as f:
-    keywords = [line.rstrip() for line in f]
-
-def Pletters(letters):
-    """The Naive Bayes log probability of a sequence of letters.
-    """
-    return sum(Pl[l.lower()] for l in letters)
-
-
-def cosine_similarity_score(text):
-    """Finds the dissimilarity of a text to English, using the cosine distance
-    of the frequency distribution.
-
-    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
-    0.26228882...
-    """
-    return norms.cosine_similarity(english_counts,
-                                   collections.Counter(sanitise(text)))
-
  
  if __name__ == "__main__":
      import doctest