From defd4de8e665aa31bbf17487bcd5517c5c84b092 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Tue, 24 Oct 2017 10:13:49 +0100 Subject: [PATCH] Updated letter counts and tests based on it. --- count_1l.txt | 52 +++++++++++++++++++++++----------------------- language_models.py | 19 +++++++++++++++-- 2 files changed, 43 insertions(+), 28 deletions(-) diff --git a/count_1l.txt b/count_1l.txt index c1fc8ef..e9ac0c6 100644 --- a/count_1l.txt +++ b/count_1l.txt @@ -1,26 +1,26 @@ -e 756288 -t 559059 -o 503173 -a 489107 -i 420131 -n 418342 -h 415853 -s 403715 -r 372431 -d 267381 -l 258537 -u 189758 -m 171836 -w 153882 -y 142711 -c 140497 -f 134935 -g 117474 -p 100241 -b 92647 -v 65181 -k 54114 -x 7386 -j 6590 -q 5488 -z 3575 +e 758103 +t 560576 +o 504520 +a 490129 +i 421240 +n 419374 +h 416369 +s 404473 +r 373599 +d 267917 +l 259023 +u 190269 +m 172199 +w 154157 +y 143040 +c 141094 +f 135318 +g 117888 +p 100690 +b 92919 +v 65297 +k 54248 +x 7414 +j 6679 +q 5499 +z 3577 diff --git a/language_models.py b/language_models.py index 8824bca..0fa6e85 100644 --- a/language_models.py +++ b/language_models.py @@ -126,6 +126,7 @@ def log_probability_of_unknown_word(key, N): Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word) Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0) P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0) +P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0) def Pwords(words): """The Naive Bayes log probability of a sequence of words. @@ -143,15 +144,29 @@ def Pbigrams(letters): """ return sum(P2l[p] for p in ngrams(letters, 2)) +def Pbigrams(letters): + """The Naive Bayes log probability of the bigrams formed from a sequence + of letters. + """ + return sum(P2l[p] for p in ngrams(letters, 2)) + +def Ptrigrams(letters): + """The Naive Bayes log probability of the trigrams formed from a sequence + of letters. + """ + return sum(P3l[p] for p in ngrams(letters, 3)) + def cosine_distance_score(text): """Finds the dissimilarity of a text to English, using the cosine distance of the frequency distribution. >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS - 0.370847405... + 0.73777... """ - return norms.cosine_distance(english_counts, + # return norms.cosine_distance(english_counts, + # collections.Counter(sanitise(text))) + return 1 - norms.cosine_similarity(english_counts, collections.Counter(sanitise(text))) -- 2.34.1