1 """Language-specific functions, including models of languages based on data of
11 from math
import log10
# Translation table that maps the typographic apostrophe (U+2019) to a plain
# ASCII apostrophe. It is applied with str.translate before accents are
# stripped.
unaccent_specials = str.maketrans({"’": "'"})
def letters(text):
    """Remove all non-alphabetic characters from a text.

    Only characters found in ``string.ascii_letters`` are kept, so digits,
    whitespace, punctuation and accented letters are all dropped. Case is
    preserved.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join(c for c in text if c in string.ascii_letters)
def unaccent(text):
    """Remove all accents from letters.

    It does this by converting the unicode string to decomposed compatibility
    form, dropping all the combining accents, then re-encoding the bytes.
    Characters listed in ``unaccent_specials`` are translated first.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('héllo')
    'hello'
    """
    translated_text = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', translated_text)
    # Encoding to ASCII with 'ignore' discards the combining marks (and any
    # other non-ASCII character); decoding turns the result back into a str
    # so that callers can keep treating it as text.
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase.

    Accents are stripped first (via ``unaccent``), so accented letters are
    kept as their base letter rather than being dropped.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    """
    return letters(unaccent(text)).lower()
def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Each non-blank line is split on ``sep``; the first field is the key and
    the second is parsed as an integer count. Any further fields are ignored.

    Args:
        name: path of the file to read.
        sep: field separator, a tab by default.

    Yields:
        Two-element lists ``[key, count]``, one per non-blank line.

    Raises:
        IndexError: if a non-blank line has no ``sep``-separated second field.
        ValueError: if the second field is not an integer.
    """
    with open(name, 'r') as f:
        for line in f:
            # A trailing or stray blank line carries no pair; skip it rather
            # than failing on the missing second field.
            if not line.strip():
                continue
            splits = line.split(sep)
            yield [splits[0], int(splits[1])]
# Raw and normalised counts of single letters, bigrams and trigrams, each
# loaded at import time from a count file in the current working directory.
english_counts = collections.Counter(dict(datafile('count_1l.txt')))
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.Counter(dict(datafile('count_2l.txt')))
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.Counter(dict(datafile('count_3l.txt')))
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One keyword per line of words.txt, with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]
def weighted_choice(d):
    """Generate random item from a dictionary of item counts.

    A target is drawn uniformly between 0 and the sum of all counts, and the
    first item whose running total exceeds the target is returned, so each
    item is chosen with probability proportional to its count.

    Args:
        d: mapping of item -> non-negative count (or weight).

    Returns:
        One key of ``d``, or ``None`` if ``d`` is empty or all counts are
        zero.
    """
    # NOTE(review): the selection loop was rebuilt as standard
    # cumulative-weight sampling -- confirm against the upstream version.
    target = random.uniform(0, sum(d.values()))
    cumulative = 0.0
    fallback = None
    for item, weight in d.items():
        cumulative += weight
        if cumulative > target:
            return item
        if weight > 0:
            fallback = item
    # random.uniform may return its upper endpoint, in which case no running
    # total strictly exceeds the target; use the last item that had weight.
    return fallback
def random_english_letter():
    """Generate a random letter based on English letter counts.

    The letter is drawn with ``weighted_choice`` from
    ``normalised_english_counts``.
    """
    return weighted_choice(normalised_english_counts)
def ngrams(text, n):
    """Returns all n-grams of a text.

    The n-grams are the overlapping slices of length ``n``, in order of
    position. A text shorter than ``n`` yields an empty list.

    >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
    ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
     'nf', 'fo', 'ox']
    >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
    ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
     'rown', 'ownf', 'wnfo', 'nfox']
    """
    return [text[i:i + n] for i in range(len(text) - n + 1)]
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.

    Values are stored and returned as log probabilities (base 10). Looking up
    a key that was not in the data does not raise; it returns whatever
    ``estimate_of_missing(key, total)`` gives, and the key is not stored.

    Attributes:
        total: sum of all counts supplied in ``data``.
        estimate_of_missing: callable ``(key, total) -> value`` used for
            keys that are absent.
    """

    def __init__(self, data=(), estimate_of_missing=None):
        """Build the distribution from an iterable of (key, count) pairs.

        Args:
            data: iterable of two-element ``(key, count)`` items; it may be a
                one-shot generator.
            estimate_of_missing: optional callable ``(key, total)`` giving
                the value for unseen keys.
        """
        super().__init__()
        # The pairs are needed twice (once for the total, once for the
        # per-key logs), so materialise them in case `data` is a generator.
        pairs = list(data)
        self.total = sum(count for _key, count in pairs)
        for key, count in pairs:
            self[key] = log10(count / self.total)
        # NOTE(review): this default returns a plain probability (1/N), not a
        # log probability like the stored values -- kept as-is for callers.
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1. / N)

    def __missing__(self, key):
        """Return the estimate for a key that has no stored value."""
        return self.estimate_of_missing(key, self.total)
def log_probability_of_unknown_word(key, N):
    """Estimate the probability of an unknown word.

    The estimate is ``1 / (N * 10 ** (1.4 * (len(key) - 2)))``, returned as a
    base-10 logarithm, so longer unknown words are penalised more heavily.

    Args:
        key: the unknown word.
        N: total count of the distribution the word is missing from.

    Returns:
        The log10 probability estimate as a float.
    """
    # Work in log space throughout: raising 10 to 1.4 * len(key) directly
    # overflows a float once the key is a few hundred characters long.
    return -(log10(N) + (len(key) - 2) * 1.4)
# Word distribution; unseen words are penalised according to their length.
Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
# Word distribution where every unseen word gets the same estimate, log10(1/N).
Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N))
# Letter, bigram and trigram distributions; unseen keys score 0.
Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0)
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.

    Each word is lowercased and looked up in ``Pw``; the log probabilities
    are summed.
    """
    return sum(Pw[w.lower()] for w in words)
def Pwords_wrong(words):
    """The Naive Bayes log probability of a sequence of words.

    Same as ``Pwords`` but scored against ``Pw_wrong``, whose estimate for
    unseen words does not depend on the word.
    """
    return sum(Pw_wrong[w.lower()] for w in words)
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.

    Each letter is lowercased and looked up in ``Pl``; the log probabilities
    are summed.
    """
    return sum(Pl[letter.lower()] for letter in letters)
def Pbigrams(letters):
    """The Naive Bayes log probability of the bigrams formed from a sequence
    of letters.

    The bigrams are looked up in ``P2l`` exactly as they appear; no case
    conversion is applied here.
    """
    return sum(P2l[pair] for pair in ngrams(letters, 2))
def Ptrigrams(letters):
    """The Naive Bayes log probability of the trigrams formed from a sequence
    of letters.

    The trigrams are looked up in ``P3l`` exactly as they appear; no case
    conversion is applied here.
    """
    return sum(P3l[triple] for triple in ngrams(letters, 3))
def cosine_similarity_score(text):
    """Compare the letter frequencies of a text with those of English.

    The text is sanitised, its letters are counted, and the counts are passed
    with ``english_counts`` to ``norms.cosine_similarity``.

    NOTE(review): earlier wording described the result as a dissimilarity
    (cosine distance) -- confirm which direction ``norms.cosine_similarity``
    scores before relying on "higher is better".

    >>> cosine_similarity_score('abcabc')  # doctest: +SKIP
    """
    letter_counts = collections.Counter(sanitise(text))
    return norms.cosine_similarity(english_counts, letter_counts)
174 if __name__
== "__main__":