From ac47ee478ba8e785037c9f25367c111f630eea54 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Mon, 7 Oct 2013 13:28:24 +0100 Subject: [PATCH] Fixed bugs in geometric and harmonic means, added some tests. --- __pycache__/cipher.cpython-33.pyc | Bin 5874 -> 7661 bytes __pycache__/norms.cpython-33.pyc | Bin 6307 -> 8560 bytes caesar_break_parameter_trials.csv | 310 +++++++++++++++------------ cipher.py | 17 +- find_best_caesar_break_parameters.py | 2 +- norms.py | 38 +++- 6 files changed, 217 insertions(+), 150 deletions(-) diff --git a/__pycache__/cipher.cpython-33.pyc b/__pycache__/cipher.cpython-33.pyc index 05222b00a2b63868a0e7d9cae75cb0884fed535a..71c263ebc64d9ac84291551734dcedb0484849e7 100644 GIT binary patch delta 2273 zcmbtVy^kA36n|@zOXBP$xwFsr5r|d^AtoPp{y5+H1UgW-0veE#0x1HbUGMDf*t1_V zJG*P^PA(EYL>IZBql5+_q3c3HlRp4z8YBt|6bcF?BnsZxkOT#a;L*(U^L+f?Z{EE3 z_SdTqUONA&K41D`c>Vgf6@V^0)=r@QDh^S*d;a_D_5#?8a0lRK1xNvUOF-r)iVk)e z>Ww4i^UjY8RW&_w7uobwu0%Qext8g1Y3nsHSYSqI%?SZzthZZ)x&?g%K zp=@I>>`I>miP(sE=yKEdY2a=IWDBRDMQ2u+d&TCc zfJ{_Vzt7+(;t=a7KE$qpQ&6|D3-AFzf?b1;G`NitV8t%NGOBwxZq(G-H@g~2)fC!M z9AzBh5(*b!AK+aB4m5>>Ap4U(LZIdM3$ThHcMA~DArx+hnnIFCP$V*9q^1C+66#q! zWE5L&;`>PDM0}~~^xy0zyo2fOCfE5V`Nw&`_dkMSV;Uvv(Ll~UWW0jfG8U@GZ8D7kYkYrkQ~P-Q`{KFM zEl4nhii{pWBD0|=QU2uzA#MVHIoRF z8&hOM3am_q!$~Yj&>a{i)~A7>qGVuuBzAXqjqc>&ElUB_E>bJ+ioq!4!i>8f8T46B z(~#thq&}y9mJ5f*Bu#zl;TFMjMzaC;NVg}VNSHqFXN*LV;}h-@hdQb6(Oe*;-$N0T zkmk(4_~L{@ZS#szUvD)!TdkdTv$IogbXr>twKg)0C#m(kp(j0u$CSmMn{k_@nY5iu zdUncV7WcBg7x(P}YH3b<&&kv-N3x7LERAs0Geu~}cE)(jso)5g_3g~hC3Bf$hpEH< zVm7L^Ce)fc^=5su+1YArx9Xi%d&>JsYFVE7W^TJ4A&HyGe3-aVLTt+&1{5bUFLGr* z@cOysduf_5%X1S#xQV{-D9tlsF+X<^#6y^Y)gR$WJU2{4o>?v~p*xemz1G-K+%}ty zHqvV}TkTG(xxM{g)CMfEtT>2+Vd~1%%q0);WdhR?78%B==~_X^1INX`X%v{I)sviz zxJb=dro%y+^I#a5LpHE5mU!xfp35m$`x49KlA95Vht#HSYi!ke?#02CS=vNQ=6mza``*0CD~^2hwqG(mxG*(4_h`V1Y}hSsonJy`Dff~X$Y6U+q3QPmSi)Q z1P~)lmL)Fi9*T{q*FaLP)KDp P%wSC{$U=-yGSYdidUp znHe?&P=X^%!*c=6`}B5XVakny+Yc`R=KH{nfqO1Ep9L8PHx6qW)IWj?;1XDmzUylPBH1HZtH?&Z zTw)cuSY@^4AiB8Z)z+I0$=vetvaCz>A$GMV%otPOV)JS%tVa)U4}F~a5jK}MRTapd66T8$Fyrj3GZ9Rp09S@%}nzjyQ48$_V zv$n7d^T68&8g|yf5wH)j?Geb?ro*{mIi_$d!NrN-yMrfQ7496F1H&?L5c2t4o(tQ^ z^SzNOV*{Z~(Czio7j#WN^_qTX@p%OMx~!__^p&k+tmE6CVZfv4_GaZW=^<&->`MFk zJ)uJ6y_!wlAi8>&e30%Qscw`iBnQ-=>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') + (4, 0.3186395289018361) + >>> caesar_break('jhzhuhfrqilqhgwrdevwudfwuhdvrqlqjwkhqkdylqjvxemhfwhgwrfulwlflvpwkhhasodqdwlrqrisrzhuwkdwmxulglfdovfl') + (3, 0.32902042861730835) + >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') + (19, 0.4215290123583277) + >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') + (13, 0.31602920807545154) + """ sanitised_message = sanitise(message) best_shift = 0 best_fit = float("inf") - for shift in range(1, 25): + for shift in range(26): plaintext = caesar_decipher(sanitised_message, shift) frequencies = message_frequency_scaling(letter_frequencies(plaintext)) fit = metric(target_frequencies, frequencies) diff --git a/find_best_caesar_break_parameters.py b/find_best_caesar_break_parameters.py index 711cff0..ed8bbaa 100644 --- a/find_best_caesar_break_parameters.py +++ b/find_best_caesar_break_parameters.py @@ -11,7 +11,7 @@ scaled_english_counts = norms.scale(english_counts) metrics = [norms.l1, norms.l2, norms.l3, norms.cosine_distance, norms.harmonic_mean, norms.geometric_mean] corpus_frequencies = [normalised_english_counts, scaled_english_counts] scalings = [norms.normalise, norms.scale] -message_lengths = [3000, 1000, 300, 100, 50, 30, 20, 10, 5] +message_lengths = [300, 100, 50, 30, 20, 10, 5] metric_names = ['l1', 'l2', 'l3', 'cosine_distance', 'harmonic_mean', 'geometric_mean'] corpus_frequency_names = ['normalised_english_counts', 'scaled_english_counts'] diff --git a/norms.py b/norms.py index 4fdf1e3..08cff74 100644 --- a/norms.py +++ b/norms.py @@ -97,24 +97,52 @@ def l3(frequencies1, frequencies2): return total ** (1/3) def geometric_mean(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as dictionaries. + """Finds the geometric mean of the absolute differences between two frequency profiles, + expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 - + + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1 + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1 + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) + 3 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1})) + 0.057022248808851934 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1})) + 0.0 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0})) + 0.009720703533656434 """ - total = 0 + total = 1 for k in frequencies1.keys(): total *= abs(frequencies1[k] - frequencies2[k]) return total def harmonic_mean(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as dictionaries. + """Finds the harmonic mean of the absolute differences between two frequency profiles, + expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1.0 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1.0 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) + 1.2857142857142858 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1})) + 0.3849001794597505 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1})) + 0 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0})) + 0.17497266360581604 """ total = 0 for k in frequencies1.keys(): + if abs(frequencies1[k] - frequencies2[k]) == 0: + return 0 total += 1 / abs(frequencies1[k] - frequencies2[k]) - return 1 / total + return len(frequencies1) / total def cosine_distance(frequencies1, frequencies2): -- 2.34.1