"""Language-specific functions, including models of languages based on data of
their use.
"""
11 from math
import log10
def letters(text):
    """Remove all non-alphabetic characters from a text.

    Only the ASCII letters in ``string.ascii_letters`` are kept; case is
    preserved and every other character (digits, punctuation, whitespace,
    accented letters) is dropped.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join(c for c in text if c in string.ascii_letters)
def unaccent(text):
    """Remove all accents from letters.

    It does this by converting the unicode string to decomposed compatibility
    form (NFKD), encoding to ASCII while ignoring everything that cannot be
    represented (which drops the combining accents), then decoding the bytes
    back into a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('caf\\u00e9')
    'cafe'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # 'ignore' silently discards the combining marks (and any other
    # non-ASCII character), so the result is always pure ASCII.
    return decomposed.encode('ascii', 'ignore').decode('ascii')
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase.

    Accents are stripped first (via `unaccent`) so that accented letters are
    kept as their base letter rather than being discarded by `letters`.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    """
    return letters(unaccent(text)).lower()
def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Each line of the file `name` is split on `sep`; the first field is the
    key and the second field is converted to an int. Any further fields on
    the line are ignored.

    Yields:
        Two-element lists ``[key, count]``, one per line, in file order
        (suitable for passing straight to ``dict``).

    Raises:
        IndexError: if a line has fewer than two fields.
        ValueError: if the second field is not an integer.
    """
    with open(name, 'r') as f:
        for line in f:
            splits = line.split(sep)
            # int() tolerates the trailing newline left on the last field.
            yield [splits[0], int(splits[1])]
# Letter counts loaded from the tab-separated data file 'count_1l.txt'
# (first field -> key, second field -> integer count).
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
# NOTE(review): presumably scales the counts into a frequency distribution;
# confirm against norms.normalise.
normalised_english_counts = norms.normalise(english_counts)
# Base-10 log of each normalised value, keyed by letter; summed by Pletters.
Pl = {l: log10(n) for l, n in normalised_english_counts.items()}

# One keyword per line of 'words.txt', with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.

    Each letter is lowercased and looked up in the module-level table `Pl`
    (log10 of the normalised English letter counts); the log values are
    summed. An empty sequence gives 0.

    Raises:
        KeyError: if a lowercased item of `letters` is not a key of `Pl`.
    """
    return sum(Pl[l.lower()] for l in letters)
def cosine_similarity_score(text):
    """Compare the letter frequencies of a text with those of English.

    The text is sanitised (accents and non-letters removed, lowercased), its
    letters are counted, and the counts are passed together with
    `english_counts` to `norms.cosine_similarity`, whose result is returned.

    NOTE(review): the earlier summary called this the "dissimilarity" of the
    text to English using the cosine distance; whether a larger value means
    closer or further depends on norms.cosine_similarity -- confirm there.

    Example call: ``cosine_similarity_score('abcabc')``
    """
    letter_counts = collections.Counter(sanitise(text))
    return norms.cosine_similarity(english_counts, letter_counts)
90 if __name__
== "__main__":