10 """Remove all non-alphabetic characters from a text
11 >>> letters('The Quick')
13 >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
14 'TheQuickBROWNfoxjumpedoverthelazyDOG'
16 return ''.join([c
for c
in text
if c
in string
.ascii_letters
])
19 """Remove all accents from letters.
It does this by converting the unicode string to decomposed compatibility
21 form, dropping all the combining accents, then re-encoding the bytes.
34 return unicodedata
.normalize('NFKD', text
).\
35 encode('ascii', 'ignore').\
39 """Remove all non-alphabetic characters and convert the text to lowercase
41 >>> sanitise('The Quick')
43 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
44 'thequickbrownfoxjumpedoverthelazydog'
48 # sanitised = [c.lower() for c in text if c in string.ascii_letters]
49 # return ''.join(sanitised)
50 return letters(unaccent(text
)).lower()
def datafile(name, sep='\t'):
    """Yield ``[key, count]`` lists read from a delimited text file.

    Each line of the file ``name`` is split on ``sep``; the first field
    is yielded unchanged as the key and the second is converted with
    ``int``.  The file stays open until the generator is exhausted or
    closed.
    """
    with open(name, 'r') as source:
        # NOTE(review): the line-iteration header was not visible in the
        # reviewed excerpt; iterating the open file line by line is
        # assumed from the use of `line` and `f` in the original body.
        for record in source:
            fields = record.split(sep)
            key, count = fields[0], int(fields[1])
            yield [key, count]
# Counts read from count_1l.txt, plus the result of norms.normalise on them.
english_counts = collections.Counter(
    {key: count for key, count in datafile('count_1l.txt')})
normalised_english_counts = norms.normalise(english_counts)

# Same pair of tables built from count_2l.txt.
english_bigram_counts = collections.Counter(
    {key: count for key, count in datafile('count_2l.txt')})
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

# Same pair of tables built from count_3l.txt.
english_trigram_counts = collections.Counter(
    {key: count for key, count in datafile('count_3l.txt')})
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# One entry per line of words.txt, with trailing whitespace removed.
with open('words.txt', 'r') as f:
    keywords = list(map(str.rstrip, f))
74 def weighted_choice(d
):
75 """Generate random item from a dictionary of item counts
77 target
= random
.uniform(0, sum(d
.values()))
79 for (l
, p
) in d
.items():
def random_english_letter():
    """Return a random letter, weighted by the English letter counts.

    The draw is delegated to ``weighted_choice`` using the module-level
    ``normalised_english_counts`` table.
    """
    letter = weighted_choice(normalised_english_counts)
    return letter
92 """A probability distribution estimated from counts in datafile.
93 Values are stored and returned as log probabilities.
def __init__(self, data=(), estimate_of_missing=None):
    """Build the distribution from ``(key, count)`` pairs.

    Args:
        data: iterable of ``(key, count)`` pairs.  It may be a one-shot
            iterator such as a generator: it is copied into a list
            before being read twice.
        estimate_of_missing: optional callable taking ``(key, total)``
            and returning the value to use for a key that was never
            counted.  When omitted, a missing key evaluates to
            ``log10(1 / total)``.

    Sets ``self.total`` to the sum of all counts and stores
    ``log10(count / total)`` under each key.
    """
    # Materialise once: the counts are needed both for the total and for
    # the per-key values, and `data` may only be iterable a single time.
    pairs = list(data)
    self.total = sum(count for _key, count in pairs)
    for key, count in pairs:
        self[key] = log10(count / self.total)
    # Stored values are log10 probabilities, so the fallback has to be on
    # the same scale: log10(1/N) == -log10(N).  Returning the plain
    # probability 1/N would rank unseen keys above every seen key.
    self.estimate_of_missing = (
        estimate_of_missing or (lambda k, N: -log10(N)))
def __missing__(self, key):
    """Return the estimate for a key that has no stored value.

    Calls ``self.estimate_of_missing`` with the key and ``self.total``
    and returns its result; nothing is stored under ``key``.
    """
    estimator = self.estimate_of_missing
    return estimator(key, self.total)
def log_probability_of_unknown_word(key, N):
    """Estimate the log10 probability of an unknown word.

    Args:
        key: the unseen word; only its length is used.
        N: the total count the estimate is scaled by (must be positive).

    Returns:
        ``-log10(N * 10 ** ((len(key) - 2) * 1.4))``, evaluated in log
        space as ``-(log10(N) + (len(key) - 2) * 1.4)``.
    """
    # Working in log space avoids the float power 10 ** x, which raises
    # OverflowError once x exceeds about 308 (keys longer than ~222
    # characters).  The two forms are mathematically identical.
    return -(log10(N) + (len(key) - 2) * 1.4)
# Distribution built from count_1w.txt; keys absent from the file are
# scored by log_probability_of_unknown_word.
Pw = Pdist(
    datafile('count_1w.txt'),
    estimate_of_missing=log_probability_of_unknown_word,
)
# Distribution built from count_1l.txt; keys absent from the file
# evaluate to 0.
Pl = Pdist(
    datafile('count_1l.txt'),
    estimate_of_missing=lambda _k, _N: 0,
)
113 """The Naive Bayes log probability of a sequence of words.
115 return sum(Pw
[w
.lower()] for w
in words
)
def Pletters(letters):
    """The Naive Bayes log probability of a sequence of letters.

    Each item of ``letters`` is lower-cased and looked up in ``Pl``; the
    looked-up values are added together and the sum is returned.
    """
    total = 0
    for symbol in letters:
        total += Pl[symbol.lower()]
    return total
124 def cosine_distance_score(text
):
125 """Finds the dissimilarity of a text to English, using the cosine distance
126 of the frequency distribution.
128 >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
131 return norms
.cosine_distance(english_counts
,
132 collections
.Counter(sanitise(text
)))
135 if __name__
== "__main__":