From 96d46a680a808555a9ff77f2eaa68383569f07ee Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Fri, 14 Mar 2014 12:12:50 +0000 Subject: [PATCH] Updated letter frequencies, updated test values to reflect them --- cipherbreak.py | 24 +++++++++------------ count_1l.txt | 52 +++++++++++++++++++++++----------------------- language_models.py | 8 +++---- lettercount.py | 6 ++---- norms.py | 8 +++---- 5 files changed, 46 insertions(+), 52 deletions(-) diff --git a/cipherbreak.py b/cipherbreak.py index d2c35c9..17df97a 100644 --- a/cipherbreak.py +++ b/cipherbreak.py @@ -50,10 +50,6 @@ def frequencies(text): >>> frequencies('abcdefabcdef')['x'] 0 """ - #counts = collections.defaultdict(int) - #for c in text: - # counts[c] += 1 - #return counts return collections.Counter(c for c in text) @@ -62,13 +58,13 @@ def caesar_break(message, fitness=Pletters): >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \ 'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS - (4, -130.849890899...) + (4, -130.849989015...) >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \ 'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS - (19, -128.82516920...) + (19, -128.82410410...) >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \ 'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS - (13, -126.25233502...) + (13, -126.25403935...) """ sanitised_message = sanitise(message) best_shift = 0 @@ -95,7 +91,7 @@ def affine_break(message, fitness=Pletters): 'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \ 'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai ' \ 'kxd clm ckuxj.') # doctest: +ELLIPSIS - ((15, 22, True), -340.611412245...) + ((15, 22, True), -340.601181913...) """ sanitised_message = sanitise(message) best_multiplier = 0 @@ -131,7 +127,7 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters): >>> keyword_break(keyword_encipher('this is a test message for the ' \ 'keyword decipherment', 'elephant', 1), \ wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS - (('elephant', 1), -52.8345642265...) + (('elephant', 1), -52.834575011...) """ best_keyword = '' best_wrap_alphabet = True @@ -162,7 +158,7 @@ def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500 >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \ 'keyword decipherment', 'elephant', 1), \ wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS - (('elephant', 1), -52.834564226507...) + (('elephant', 1), -52.834575011...) """ with Pool() as pool: helper_args = [(message, word, wrap, fitness) @@ -287,7 +283,7 @@ def vigenere_keyword_break(message, wordlist=keywords, fitness=Pletters): >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \ 'message for the vigenere decipherment'), 'cat'), \ wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS - ('cat', -52.9479167030...) + ('cat', -52.947271216...) """ best_keyword = '' best_fit = float("-inf") @@ -315,7 +311,7 @@ def vigenere_keyword_break_mp(message, wordlist=keywords, fitness=Pletters, >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \ 'message for the vigenere decipherment'), 'cat'), \ wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS - ('cat', -52.9479167030...) + ('cat', -52.947271216...) """ with Pool() as pool: helper_args = [(message, word, fitness) @@ -345,7 +341,7 @@ def vigenere_frequency_break(message, fitness=Pletters): "certain that the theft has been discovered and that I will " \ "be caught. The SS officer visits less often now that he is " \ "sure"), 'florence')) # doctest: +ELLIPSIS - ('florence', -307.5549865898...) + ('florence', -307.5473096791...) """ best_fit = float("-inf") best_key = '' @@ -376,7 +372,7 @@ def beaufort_frequency_break(message, fitness=Pletters): "certain that the theft has been discovered and that I will " \ "be caught. The SS officer visits less often now " \ "that he is sure"), 'florence')) # doctest: +ELLIPSIS - ('florence', -307.5549865898...) + ('florence', -307.5473096791...) """ best_fit = float("-inf") best_key = '' diff --git a/count_1l.txt b/count_1l.txt index c1fc8ef..e9ac0c6 100644 --- a/count_1l.txt +++ b/count_1l.txt @@ -1,26 +1,26 @@ -e 756288 -t 559059 -o 503173 -a 489107 -i 420131 -n 418342 -h 415853 -s 403715 -r 372431 -d 267381 -l 258537 -u 189758 -m 171836 -w 153882 -y 142711 -c 140497 -f 134935 -g 117474 -p 100241 -b 92647 -v 65181 -k 54114 -x 7386 -j 6590 -q 5488 -z 3575 +e 758103 +t 560576 +o 504520 +a 490129 +i 421240 +n 419374 +h 416369 +s 404473 +r 373599 +d 267917 +l 259023 +u 190269 +m 172199 +w 154157 +y 143040 +c 141094 +f 135318 +g 117888 +p 100690 +b 92919 +v 65297 +k 54248 +x 7414 +j 6679 +q 5499 +z 3577 diff --git a/language_models.py b/language_models.py index 9297468..ceb4596 100644 --- a/language_models.py +++ b/language_models.py @@ -140,14 +140,14 @@ def Pbigrams(letters): return sum(P2l[p] for p in ngrams(letters, 2)) -def cosine_distance_score(text): +def cosine_similarity_score(text): """Finds the dissimilarity of a text to English, using the cosine distance of the frequency distribution. - >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS - 0.370847405... + >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS + 0.26228882... """ - return norms.cosine_distance(english_counts, + return norms.cosine_similarity(english_counts, collections.Counter(sanitise(text))) diff --git a/lettercount.py b/lettercount.py index 18f75c4..956eca1 100644 --- a/lettercount.py +++ b/lettercount.py @@ -8,8 +8,6 @@ for corpus in corpora: text = sanitise(open(corpus).read()) counts.update(text) -sorted_letters = sorted(counts, key=counts.get, reverse=True) - with open('count_1l.txt', 'w') as f: - for l in sorted_letters: - f.write("{}\t{}\n".format(l, counts[l])) + for l, c in counts.most_common(): + f.write("{}\t{}\n".format(l, c)) diff --git a/norms.py b/norms.py index 37fd3c9..b8e4bf1 100644 --- a/norms.py +++ b/norms.py @@ -163,13 +163,13 @@ def cosine_similarity(frequencies1, frequencies2): """Finds the distances between two frequency profiles, expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 - >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS + >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS 1.0000000000... - >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS + >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS 1.0000000000... - >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS + >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS 0.5773502691... - >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS + >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS 0.7071067811... """ numerator = 0 -- 2.34.1