"""
import string
-import random
-import norms
-import collections
import unicodedata
-import itertools
-from math import log10
def letters(text):
"""Remove all non-alphabetic characters from a text
return letters(unaccent(text)).lower()
def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Each non-blank line of the file `name` is split on `sep`. The first
    field is the key and the second field is converted with int(); any
    further fields are ignored.

    Yields one two-element list ``[key, value]`` per line, lazily.
    Blank or whitespace-only lines are skipped.

    Raises IndexError if a non-blank line does not contain `sep`, and
    ValueError if the second field is not an integer.
    """
    with open(name, 'r') as f:
        for line in f:
            # A blank line (typically a trailing one at the end of the
            # file) carries no pair; skip it instead of failing on it.
            if not line.strip():
                continue
            splits = line.split(sep)
            # int() tolerates the trailing newline left on the last field.
            yield [splits[0], int(splits[1])]
-
# Single-letter counts loaded from the file 'count_1l.txt' via datafile().
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
# The same counts after being passed through norms.normalise.
normalised_english_counts = norms.normalise(english_counts)

# The log 10 of each normalised letter value, keyed by letter.
Pl = dict(
    (letter, log10(weight))
    for letter, weight in normalised_english_counts.items()
)

# One entry per line of 'words.txt', with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = list(map(str.rstrip, f))
-
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.

    Every item of `letters` is lower-cased and looked up in the
    module-level table `Pl`; the looked-up values are added together.
    An item whose lower-cased form is not a key of `Pl` raises KeyError.
    """
    log_probabilities = [Pl[symbol.lower()] for symbol in letters]
    return sum(log_probabilities)
-
-
def cosine_similarity_score(text):
    """Score a text against the English letter frequency distribution.

    The text is passed through sanitise(), the resulting items are
    counted, and those counts are compared with `english_counts` using
    norms.cosine_similarity.

    NOTE(review): this was previously described as a dissimilarity
    (cosine distance), but the helper called is named cosine_similarity;
    confirm the direction of the score against the norms module.

    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
    0.26228882...
    """
    text_counts = collections.Counter(sanitise(text))
    return norms.cosine_similarity(english_counts, text_counts)
-
if __name__ == "__main__":
import doctest