From 26f5de0a23dd94ded412f6f507910ac5e26ea2b6 Mon Sep 17 00:00:00 2001
From: Neil Smith
Date: Sun, 6 Oct 2013 12:37:11 +0100
Subject: [PATCH] Pulled out the norms into a separate file, corrected a bug in normalise, redid the tests.

---
 __pycache__/norms.cpython-33.pyc |   Bin 0 -> 6307 bytes
 cipher.py                        |   127 +++----------------------
 norms.py                         |   125 ++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 116 deletions(-)
 create mode 100644 __pycache__/norms.cpython-33.pyc
 create mode 100644 norms.py

diff --git a/__pycache__/norms.cpython-33.pyc b/__pycache__/norms.cpython-33.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b18ed4b715278fb73b31ca57c434135a232eb74b
Binary files /dev/null and b/__pycache__/norms.cpython-33.pyc differ
diff --git a/cipher.py b/cipher.py
index 0fc9e88..0536350 100644
--- a/cipher.py
+++ b/cipher.py
@@ -1,5 +1,13 @@
 import string
 import collections
+import norms
+
+english_counts = collections.defaultdict(int)
+with open('count_1l.txt', 'r') as f:
+    for line in f:
+        (letter, count) = line.split("\t")
+        english_counts[letter] = int(count)
+normalised_english_counts = norms.normalise(english_counts)
 
 
 def sanitise(text):
@@ -30,109 +38,6 @@ def letter_frequencies(text):
         counts[c] += 1
     return counts
 
-
-def normalise_frequencies(frequencies):
-    """Scale a set of letter frequenies so they add to 1
-
-    >>> sorted(normalise_frequencies(letter_frequencies('abcdefabc')).items())
-    [('a', 0.2222222222222222), ('b', 0.2222222222222222), ('c', 0.2222222222222222), ('d', 0.1111111111111111), ('e', 0.1111111111111111), ('f', 0.1111111111111111)]
-    >>> sorted(normalise_frequencies(letter_frequencies('the quick brown fox jumped over the lazy dog')).items())
-    [(' ', 0.18181818181818182), ('a', 0.022727272727272728), ('b', 0.022727272727272728), ('c', 0.022727272727272728), ('d', 0.045454545454545456), ('e', 0.09090909090909091), ('f', 0.022727272727272728), ('g', 0.022727272727272728), ('h', 0.045454545454545456), ('i', 0.022727272727272728), ('j', 0.022727272727272728), ('k', 0.022727272727272728), ('l', 0.022727272727272728), ('m', 0.022727272727272728), ('n', 0.022727272727272728), ('o', 0.09090909090909091), ('p', 0.022727272727272728), ('q', 0.022727272727272728), ('r', 0.045454545454545456), ('t', 0.045454545454545456), ('u', 0.045454545454545456), ('v', 0.022727272727272728), ('w', 0.022727272727272728), ('x', 0.022727272727272728), ('y', 0.022727272727272728), ('z', 0.022727272727272728)]
-    >>> sorted(normalise_frequencies(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
-    [(' ', 0.1568627450980392), ('!', 0.0196078431372549), ('(', 0.0196078431372549), (')', 0.0196078431372549), ('.', 0.058823529411764705), ('9', 0.0196078431372549), ('B', 0.0196078431372549), ('D', 0.0196078431372549), ('G', 0.0196078431372549), ('N', 0.0196078431372549), ('O', 0.0392156862745098), ('Q', 0.0196078431372549), ('R', 0.0196078431372549), ('T', 0.0196078431372549), ('W', 0.0196078431372549), ('a', 0.0196078431372549), ('c', 0.0196078431372549), ('d', 0.0196078431372549), ('e', 0.0784313725490196), ('f', 0.0196078431372549), ('h', 0.0392156862745098), ('i', 0.0196078431372549), ('j', 0.0196078431372549), ('k', 0.0196078431372549), ('l', 0.0196078431372549), ('m', 0.0196078431372549), ('o', 0.0392156862745098), ('p', 0.0196078431372549), ('r', 0.0196078431372549), ('t', 0.0196078431372549), ('u', 0.0392156862745098), ('v', 0.0196078431372549), ('x', 0.0196078431372549), ('y', 0.0196078431372549), ('z', 0.0196078431372549)]
-    >>> sorted(normalise_frequencies(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG'))).items())
-    [('a', 0.027777777777777776), ('b', 0.027777777777777776), ('c', 0.027777777777777776), ('d', 0.05555555555555555), ('e', 0.1111111111111111), ('f', 0.027777777777777776), ('g', 0.027777777777777776), ('h', 0.05555555555555555), ('i', 0.027777777777777776), ('j', 0.027777777777777776), ('k', 0.027777777777777776), ('l', 0.027777777777777776), ('m', 0.027777777777777776), ('n', 0.027777777777777776), ('o', 0.1111111111111111), ('p', 0.027777777777777776), ('q', 0.027777777777777776), ('r', 0.05555555555555555), ('t', 0.05555555555555555), ('u', 0.05555555555555555), ('v', 0.027777777777777776), ('w', 0.027777777777777776), ('x', 0.027777777777777776), ('y', 0.027777777777777776), ('z', 0.027777777777777776)]
-    """
-    total = sum(frequencies.values())
-    return dict((k, v / total) for (k, v) in frequencies.items())
-
-def l2_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> l2_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l2_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l2_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.816496580927726
-    >>> l2_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.7071067811865476
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += (f1n[k] - f2n[k]) ** 2
-    return total ** 0.5
-euclidean_distance = l2_norm
-
-def l1_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> l1_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l1_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l1_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    1.3333333333333333
-    >>> l1_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    1.0
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += abs(f1n[k] - f2n[k])
-    return total
-
-def l3_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> l3_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l3_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l3_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.7181448966772946
-    >>> l3_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.6299605249474366
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += abs(f1n[k] - f2n[k]) ** 3
-    return total ** (1/3)
-
-def cosine_distance(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    -2.220446049250313e-16
-    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    -2.220446049250313e-16
-    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.42264973081037416
-    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.29289321881345254
-    """
-    numerator = 0
-    length1 = 0
-    length2 = 0
-    for k in frequencies1.keys():
-        numerator += frequencies1[k] * frequencies2[k]
-        length1 += frequencies1[k]**2
-    for k in frequencies2.keys():
-        length2 += frequencies2[k]
-    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
-
-
-
-
 
 def caesar_encipher_letter(letter, shift):
     """Encipher a letter, given a shift amount
@@ -199,30 +104,20 @@ def caesar_decipher(message, shift):
     """
     return caesar_encipher(message, -shift)
 
-def caesar_break(message, metric=euclidean_distance):
+def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
     sanitised_message = sanitise(message)
     best_shift = 0
     best_fit = float("inf")
     for shift in range(1, 25):
         plaintext = caesar_decipher(sanitised_message, shift)
-        frequencies = letter_frequencies(plaintext)
-        fit = metric(english_counts, frequencies)
+        frequencies = message_frequency_scaling(letter_frequencies(plaintext))
+        fit = metric(target_frequencies, frequencies)
         if fit < best_fit:
             best_fit = fit
             best_shift = shift
     return best_shift, best_fit
-
-
-
-english_counts = collections.defaultdict(int)
-with open('count_1l.txt', 'r') as f:
-    for line in f:
-        (letter, count) = line.split("\t")
-        english_counts[letter] = int(count)
-
-
 
 
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
diff --git a/norms.py b/norms.py
new file mode 100644
index 0000000..744cbe4
--- /dev/null
+++ b/norms.py
@@ -0,0 +1,125 @@
+import collections
+
+def normalise(frequencies):
+    """Scale a set of frequencies so they have a unit euclidean length
+
+    >>> sorted(normalise({1: 1, 2: 0}).items())
+    [(1, 1.0), (2, 0.0)]
+    >>> sorted(normalise({1: 1, 2: 1}).items())
+    [(1, 0.7071067811865475), (2, 0.7071067811865475)]
+    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
+    [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
+    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
+    [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
+    """
+    length = sum([f ** 2 for f in frequencies.values()]) ** 0.5
+    return collections.defaultdict(int, ((k, v / length) for (k, v) in frequencies.items()))
+
+def scale(frequencies):
+    """Scale a set of frequencies so the largest is 1
+
+    >>> sorted(scale({1: 1, 2: 0}).items())
+    [(1, 1.0), (2, 0.0)]
+    >>> sorted(scale({1: 1, 2: 1}).items())
+    [(1, 1.0), (2, 1.0)]
+    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
+    [(1, 1.0), (2, 1.0), (3, 1.0)]
+    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
+    [(1, 0.5), (2, 1.0), (3, 0.5)]
+    """
+    largest = max(frequencies.values())
+    return collections.defaultdict(int, ((k, v / largest) for (k, v) in frequencies.items()))
+
+
+def l2(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0.0
+    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.7320508075688772
+    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0.0
+    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    1.7320508075688772
+    >>> l2(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
+    0.9194016867619662
+    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
+    1.0
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += (frequencies1[k] - frequencies2[k]) ** 2
+    return total ** 0.5
+euclidean_distance = l2
+
+def l1(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0
+    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    3
+    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0.0
+    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    3
+    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
+    1
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += abs(frequencies1[k] - frequencies2[k])
+    return total
+
+def l3(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0.0
+    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.4422495703074083
+    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    1.4422495703074083
+    >>> l3(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
+    0.7721675487598008
+    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
+    1.0
+    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1}))
+    0.7234757712960591
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += abs(frequencies1[k] - frequencies2[k]) ** 3
+    return total ** (1/3)
+
+def cosine_distance(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    -2.220446049250313e-16
+    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    -2.220446049250313e-16
+    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    0.42264973081037416
+    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
+    0.29289321881345254
+    """
+    numerator = 0
+    length1 = 0
+    length2 = 0
+    for k in frequencies1.keys():
+        numerator += frequencies1[k] * frequencies2[k]
+        length1 += frequencies1[k]**2
+    for k in frequencies2.keys():
+        length2 += frequencies2[k] ** 2
+    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
-- 
2.34.1
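
For orientation, a minimal usage sketch of the interface this patch introduces, assuming the patch is applied and the tab-separated letter-count file count_1l.txt that cipher.py reads at import time is present; the sample message and shift below are arbitrary:

    import norms
    import cipher

    # Encipher a sample message with a Caesar shift of 7, then try to recover the shift.
    ciphertext = cipher.caesar_encipher('the quick brown fox jumped over the lazy dog', 7)

    # Defaults: message frequencies are scaled to unit length with norms.normalise and
    # compared to the normalised English counts with norms.euclidean_distance (l2).
    shift, fit = cipher.caesar_break(ciphertext)

    # Any other metric from norms.py can be plugged in, e.g. the l1 distance.
    shift_l1, fit_l1 = cipher.caesar_break(ciphertext, metric=norms.l1)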