import itertools
import os
import unicodedata
from math import log10
# Translation table applied before accent stripping: maps the typographic
# right single quotation mark (U+2019) to a plain ASCII apostrophe.
unaccent_specials = str.maketrans({"’": "'"})
def letters(text):
    """Remove all non-alphabetic characters from a text.

    Only characters found in ``string.ascii_letters`` are kept; their case
    and order are preserved.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join(c for c in text if c in string.ascii_letters)
def unaccent(text):
    """Remove all accents from letters.

    The text is first passed through the ``unaccent_specials`` translation
    table, then converted to decomposed compatibility form (NFKD). Encoding
    to ASCII with errors ignored drops the combining accents (and any other
    non-ASCII character), and the resulting bytes are decoded back to a
    string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    translated_text = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', translated_text)
    # The bytes are pure ASCII at this point, so decode with the same codec
    # that produced them.
    return decomposed.encode('ascii', 'ignore').decode('ascii')
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase.

    Accents are stripped with ``unaccent`` before ``letters`` filters the
    text, so accented letters are kept as their unaccented equivalents.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    return letters(unaccent(text)).lower()
def datafile(name, sep='\t'):
    """Read key, value pairs from a file.

    ``name`` is resolved relative to the directory containing this module,
    so the data is found regardless of the current working directory (an
    absolute ``name`` is used unchanged). Each non-blank line is split on
    ``sep``; the first field is the key and the second is converted to an
    int.

    Yields:
        ``[key, count]`` lists, one per non-blank line.

    Raises:
        IndexError: if a non-blank line has no ``sep``-separated second field.
        ValueError: if the second field is not an integer.
    """
    path = os.path.join(os.path.dirname(os.path.realpath(__file__)), name)
    with open(path, 'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the missing second field.
            if not line.strip():
                continue
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
# Letter-trigram counts read from count_3l.txt, and the result of passing
# them through norms.normalise.
english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# Keyword list: one entry per line of words.txt, located next to this module
# so that loading does not depend on the current working directory.
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'words.txt'), 'r') as f:
    keywords = [line.rstrip() for line in f]
def weighted_choice(d):
    """Generate random item from a dictionary of item counts.

    A target is drawn uniformly between 0 and the sum of all counts, and the
    items are walked in dictionary order, accumulating their counts; the
    first item whose running total exceeds the target is returned, so each
    item is chosen in proportion to its count.

    Returns:
        A key of ``d``, or ``None`` if no running total exceeds the target
        (for example when ``d`` is empty).
    """
    target = random.uniform(0, sum(d.values()))
    cuml = 0.0
    for item, count in d.items():
        cuml += count
        if cuml > target:
            return item
    return None
def random_english_letter():
    """Generate a random letter based on English letter counts.

    The letter is drawn with ``weighted_choice`` from the module-level
    ``normalised_english_counts`` table.
    """
    return weighted_choice(normalised_english_counts)
def ngrams(text, n):
# Pdist tables built from the count files: words, single letters, letter
# pairs and letter triples. The word table uses
# log_probability_of_unknown_word as its second argument; the letter tables
# pass a function that always returns 0.
# NOTE(review): the second argument looks like a handler for keys missing
# from the counts -- confirm against the Pdist definition.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
def Pwords(words):
"""The Naive Bayes log probability of a sequence of words.
"""
return sum(P2l[p] for p in ngrams(letters, 2))
def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a sequence
    of letters.

    Sums the ``P3l`` entry for every item produced by ``ngrams(letters, 3)``.
    """
    return sum(P3l[p] for p in ngrams(letters, 3))
def cosine_distance_score(text):
    """Finds the dissimilarity of a text to English, using the cosine distance
    of the frequency distribution.

    The text is sanitised, its letters are counted, and the score is one
    minus ``norms.cosine_similarity`` between ``english_counts`` and those
    letter counts.

    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
    0.73777...
    """
    return 1 - norms.cosine_similarity(english_counts,
                                       collections.Counter(sanitise(text)))