From ac47ee478ba8e785037c9f25367c111f630eea54 Mon Sep 17 00:00:00 2001 From: Neil Smith <neil.github@njae.me.uk> Date: Mon, 7 Oct 2013 13:28:24 +0100 Subject: [PATCH] Fixed bugs in geometric and harmonic means, added some tests. --- __pycache__/cipher.cpython-33.pyc | Bin 5874 -> 7661 bytes __pycache__/norms.cpython-33.pyc | Bin 6307 -> 8560 bytes caesar_break_parameter_trials.csv | 310 +++++++++++++++------------ cipher.py | 17 +- find_best_caesar_break_parameters.py | 2 +- norms.py | 38 +++- 6 files changed, 217 insertions(+), 150 deletions(-) diff --git a/__pycache__/cipher.cpython-33.pyc b/__pycache__/cipher.cpython-33.pyc index 05222b00a2b63868a0e7d9cae75cb0884fed535a..71c263ebc64d9ac84291551734dcedb0484849e7 100644 GIT binary patch delta 2273 zcmbtVy^kA36n|@zOXBP$xwFsr5r|d^AtoPp{y5+H1UgW-0veE#0x1HbUGMDf*t1_V zJG*P^PA(EYL>IZBql5+_q3c3HlRp4z8YBt|6bcF?BnsZxkOT#a;L*(U^L+f?Z{EE3 z_SdTqUONA&K41D`c>Vgf6@V^0)=r@QDh^S*d;a_D_5#?8a0lRK1xNvUOF-r)iVk)e z><Zi|!p$m>Ww4i^UjY8RW&_w7uobwu0%Qext8g1Y3nsHSYSqI%?SZzthZZ)x&?g%K zp=@I>>`I>miP(sE=yKEdY2a=IW<tZjWLLX1@(91$^-RV{;F9sd>DBRDMQ2u+d&TCc zfJ{_Vzt7+(;t=a7KE$qpQ&6|D3-AFzf?b1;G`NitV8t%NGOBwxZq(G-H@g~2)fC!M z9AzBh5(*b!AK+aB4m5>>Ap4U(LZIdM3$ThHcMA~DArx+hnnIFCP$V*9q^1C+66<xT zmgsawsex3s6Ot#Blk%y2yYR2h9jzjwQ09(<TvImGqlopFS=cq1*hPhkq#OFtE>#q! zWE5L&;`>PDM0}~~^xy0zyo2fOCfE5V`Nw&`_dkMSV;Uvv(Ll~UWW0jf<Q||sHhcJU zu`u`gQ~Bu$H$5^p{-M0AojI(Oek{xykbmeGv@3__=G!x8$U7%5%)jyQ^2cO#{MBN4 ze6~7w`1#52W=6;S({7DFIell=3mXe(wHt@8ESNJ>G8U@GZ8D7kYkYrkQ~P-Q`{KFM zEl4nhi<q<HTKU@OGVGV&U=H?=!od-UFRT9c{t?)pQ>i{pWBD0|=QU2uzA#MVHIoRF z8&hOM3am_q!$~Yj&>a{i)~A7>qGVuuBzAXqjqc>&ElUB_E>bJ+ioq!4!i>8f8T46B z(~#thq&}y9mJ5f*Bu#zl;TFMjMzaC;NVg}VNSHqFXN*LV;}h-@hdQb6(Oe*;-$N0T zkmk(4_~L{@ZS#szUvD)!TdkdTv$IogbXr>twKg)0C#m(kp(j0u$CSmMn{k_@nY5iu zdUncV7WcBg7x(P}YH3b<&&kv-N3x7LERAs0Geu~}cE)(jso)5g_3g~hC3Bf$hpEH< zVm7L^Ce)fc^=5su+1YArx9Xi%d&>JsYFVE7W^TJ4A&HyGe3-aVLTt+&1{5bUFLGr* z@cOysduf_5%X1S#xQV{-D9tlsF+X<^#6y^Y)gR$WJU2{4o>?v~p*xemz1G-K+%}ty zHqvV}TkTG(xxM{g)CMfEtT>2+Vd~1%%q0);WdhR?78%B==~_X^1INX`X%v{I)sviz zxJb=dro%y+^I#a5LpHE5mU!xfp35m$`x49KlA95Vht#HSYi!k<JI#8hzTMtzH`<$1 zZWQwzCN-A^4prs4rb@9qhC-O=C@RS0Q{u4kH|5`E&DndEGmH2+2T?@<^SU;t=~{XG zQ{{@bHm)q)FFh*1sx*#&S^9cblV4V!z4QVa)6tS%IHfIW^V*WOrkyJ4C-kDepg*UV M^i#TiW=-w*6PLWAhX4Qo delta 470 zcmZ8c&o2W(7=7QaE^1b7ySA<MBYq_$R3apVh&TurDGm~m2xdk7EY*W_#i6~4gUnU@ z2}JhjByn+(h@*>e?#02CS=vNQ=6mza``*0CD~^2hwqG(mxG*(4_h<lX__JzcKQ1fN zc;5ap=_uGV;pp%i25x|dQBbj1561$JU`xS4j5xpv+fgh=;Y6_+Lmr^rA9lXW^paGh zfbX0TIrXH}4OR4eX^phf^_sO(%zI+*O6W(~x0Xt8I0uRJk+^EKsoRUy4VFz#jZT{M zZj8?|kyGQeB)%erlG&ZvOiAz6Tqz6gOskqZzn075(i)&M@ocr}t2k7^RqzVr74c<F z3@6|Qaje0u#Bl)o34brR0k|6070h=9WQ>`V1Y}hSsonJy`Dff~X$Y6U+q3QPmSi)Q z1P~)lmL)Fi9*T<x`(C^;OUAmL{ueb{CFP?#dpx`CjQO^i4hc*?W=e>{q*FaLP)KDp P%wSC{$U=<Swf_7Y5t3W~ diff --git a/__pycache__/norms.cpython-33.pyc b/__pycache__/norms.cpython-33.pyc index b18ed4b715278fb73b31ca57c434135a232eb74b..7d10c2f61bc74f9763ecdc1456169892160835f5 100644 GIT binary patch delta 1140 zcma)5&rcIU6n?WMi0y2tEz%<Xn1mWiV#sW})7=$A8Zkt@n9zWRgc#@!QlM?@wuwJV zAmYhH6CI-m6D}M*8E@YF5BxK{nD`Hv_;#T{Ekq`HZzgZv_r2Zsz1cN>-yGSYdidUp znHe?&P=X^%!*c=6`}B5XVakny+Yc`R=KH{nfqO1Ep9L8PHx6qW)IWj?;1XDmz<dN` z1Xd|*1CVMEEoKs5K3_?w(U`6rT}(S08az(G{gwvBj9;qx6a93#>UylPBH1HZtH?&Z zTw)cuSY@^4AiB8Z)z+I0$=vetvaCz>A$GMV%otPOV)JS%tVa)U4}F~a5jK<lJ)4}I zJmfshU785B6;1t1$Rf^!xW-0Hqt}g%v9W-gk8@%1Y&L7!Hn(kII63pAqAW7oO<73u zBV63EvfSc1k;~;p-rVOZ6SdI4yBWn+b^fC-n%9R8;=Lq67-=4p<9FC!J!ogprqH|! zIus0*crphqfK~7XyR~o|HbT%KeyA0Kp%$KSOKlTyH%XxNgpVttcCL1O;;^-^Z$tBG z5Q@pe#Y_lGHh*;}y|P%Z)T;jxx>}MRTapd66T8$Fyrj3GZ9Rp09S@%}nzjyQ48$_V zv$n7d^T68&8g|yf5wH)j?Geb?ro*{mIi_$d!NrN-yMrfQ7496F1H&?L5c2t4o(tQ^ z^SzNOV*{Z~(Czio7j#WN^_qTX@p%OMx~!__^p&k+tmE6CVZfv4_GaZW=^<&->`MFk zJ)uJ6y_!wlAi8>&e30%Qscw`iBnQ-=<eQ5r<fj_#r!jiISEB=TkS6H`nxce~zhv|n ArT_o{ delta 137 zcmez1wAfI29uF_q2d}`O8bJnzWCkR_1Y|n^adGoR<yh@h4u%vyhHNH=qI!lD28I+y zh8#|YC{Bh{PKGQlhGqr^#yqfSum=Ce!}7wL^JKc18Ivdfl(%8zo@}NNF?qAXLJ1L| ZaXd^s{5%3YqHIh&B0OR|5<HANi~uN581?`F diff --git a/caesar_break_parameter_trials.csv b/caesar_break_parameter_trials.csv index df9b836..ba7ee27 100644 --- a/caesar_break_parameter_trials.csv +++ b/caesar_break_parameter_trials.csv @@ -1,144 +1,168 @@ -l1, normalised_english_counts, normalise, 3000, 0.9616 -l1, normalised_english_counts, normalise, 1000, 0.9562 -l1, normalised_english_counts, normalise, 300, 0.9598 -l1, normalised_english_counts, normalise, 100, 0.9622 -l1, normalised_english_counts, normalise, 50, 0.9584 -l1, normalised_english_counts, normalise, 30, 0.953 -l1, normalised_english_counts, normalise, 20, 0.917 -l1, normalised_english_counts, normalise, 10, 0.7328 -l1, normalised_english_counts, normalise, 5, 0.4394 -l1, normalised_english_counts, scale, 3000, 0.9618 -l1, normalised_english_counts, scale, 1000, 0.9574 -l1, normalised_english_counts, scale, 300, 0.9624 -l1, normalised_english_counts, scale, 100, 0.9566 -l1, normalised_english_counts, scale, 50, 0.959 -l1, normalised_english_counts, scale, 30, 0.9476 -l1, normalised_english_counts, scale, 20, 0.8968 -l1, normalised_english_counts, scale, 10, 0.6844 -l1, normalised_english_counts, scale, 5, 0.4298 -l1, scaled_english_counts, normalise, 3000, 0.957 -l1, scaled_english_counts, normalise, 1000, 0.9662 -l1, scaled_english_counts, normalise, 300, 0.9604 -l1, scaled_english_counts, normalise, 100, 0.9602 -l1, scaled_english_counts, normalise, 50, 0.9578 -l1, scaled_english_counts, normalise, 30, 0.9504 -l1, scaled_english_counts, normalise, 20, 0.9174 -l1, scaled_english_counts, normalise, 10, 0.7204 -l1, scaled_english_counts, normalise, 5, 0.4506 -l1, scaled_english_counts, scale, 3000, 0.9584 -l1, scaled_english_counts, scale, 1000, 0.9586 -l1, scaled_english_counts, scale, 300, 0.964 -l1, scaled_english_counts, scale, 100, 0.9582 -l1, scaled_english_counts, scale, 50, 0.9606 -l1, scaled_english_counts, scale, 30, 0.944 -l1, scaled_english_counts, scale, 20, 0.915 -l1, scaled_english_counts, scale, 10, 0.7324 -l1, scaled_english_counts, scale, 5, 0.4446 -l2, normalised_english_counts, normalise, 3000, 0.953 -l2, normalised_english_counts, normalise, 1000, 0.962 -l2, normalised_english_counts, normalise, 300, 0.9638 -l2, normalised_english_counts, normalise, 100, 0.9632 -l2, normalised_english_counts, normalise, 50, 0.9604 -l2, normalised_english_counts, normalise, 30, 0.95 -l2, normalised_english_counts, normalise, 20, 0.892 -l2, normalised_english_counts, normalise, 10, 0.7124 -l2, normalised_english_counts, normalise, 5, 0.4406 -l2, normalised_english_counts, scale, 3000, 0.9626 -l2, normalised_english_counts, scale, 1000, 0.956 -l2, normalised_english_counts, scale, 300, 0.962 -l2, normalised_english_counts, scale, 100, 0.9572 -l2, normalised_english_counts, scale, 50, 0.9526 -l2, normalised_english_counts, scale, 30, 0.9478 -l2, normalised_english_counts, scale, 20, 0.9046 -l2, normalised_english_counts, scale, 10, 0.6896 -l2, normalised_english_counts, scale, 5, 0.4308 -l2, scaled_english_counts, normalise, 3000, 0.9574 -l2, scaled_english_counts, normalise, 1000, 0.9568 -l2, scaled_english_counts, normalise, 300, 0.9536 -l2, scaled_english_counts, normalise, 100, 0.9624 -l2, scaled_english_counts, normalise, 50, 0.9606 -l2, scaled_english_counts, normalise, 30, 0.9384 -l2, scaled_english_counts, normalise, 20, 0.8914 -l2, scaled_english_counts, normalise, 10, 0.6892 -l2, scaled_english_counts, normalise, 5, 0.4196 -l2, scaled_english_counts, scale, 3000, 0.9532 -l2, scaled_english_counts, scale, 1000, 0.9588 -l2, scaled_english_counts, scale, 300, 0.9644 -l2, scaled_english_counts, scale, 100, 0.9572 -l2, scaled_english_counts, scale, 50, 0.9586 -l2, scaled_english_counts, scale, 30, 0.9436 -l2, scaled_english_counts, scale, 20, 0.9036 -l2, scaled_english_counts, scale, 10, 0.693 -l2, scaled_english_counts, scale, 5, 0.4376 -l3, normalised_english_counts, normalise, 3000, 0.9626 -l3, normalised_english_counts, normalise, 1000, 0.9582 -l3, normalised_english_counts, normalise, 300, 0.9542 -l3, normalised_english_counts, normalise, 100, 0.9606 -l3, normalised_english_counts, normalise, 50, 0.953 -l3, normalised_english_counts, normalise, 30, 0.9248 -l3, normalised_english_counts, normalise, 20, 0.8684 -l3, normalised_english_counts, normalise, 10, 0.6106 -l3, normalised_english_counts, normalise, 5, 0.4064 -l3, normalised_english_counts, scale, 3000, 0.961 -l3, normalised_english_counts, scale, 1000, 0.9568 -l3, normalised_english_counts, scale, 300, 0.9566 -l3, normalised_english_counts, scale, 100, 0.9554 -l3, normalised_english_counts, scale, 50, 0.9436 -l3, normalised_english_counts, scale, 30, 0.8936 -l3, normalised_english_counts, scale, 20, 0.8016 -l3, normalised_english_counts, scale, 10, 0.579 +l1, normalised_english_counts, normalise, 300, 0.9992 +l1, normalised_english_counts, normalise, 100, 0.9996 +l1, normalised_english_counts, normalise, 50, 0.9992 +l1, normalised_english_counts, normalise, 30, 0.9914 +l1, normalised_english_counts, normalise, 20, 0.9532 +l1, normalised_english_counts, normalise, 10, 0.7442 +l1, normalised_english_counts, normalise, 5, 0.4358 +l1, normalised_english_counts, scale, 300, 1.0 +l1, normalised_english_counts, scale, 100, 0.999 +l1, normalised_english_counts, scale, 50, 0.9988 +l1, normalised_english_counts, scale, 30, 0.9848 +l1, normalised_english_counts, scale, 20, 0.9316 +l1, normalised_english_counts, scale, 10, 0.715 +l1, normalised_english_counts, scale, 5, 0.436 +l1, scaled_english_counts, normalise, 300, 0.9994 +l1, scaled_english_counts, normalise, 100, 0.9998 +l1, scaled_english_counts, normalise, 50, 0.999 +l1, scaled_english_counts, normalise, 30, 0.9868 +l1, scaled_english_counts, normalise, 20, 0.9482 +l1, scaled_english_counts, normalise, 10, 0.7434 +l1, scaled_english_counts, normalise, 5, 0.4532 +l1, scaled_english_counts, scale, 300, 0.9996 +l1, scaled_english_counts, scale, 100, 1.0 +l1, scaled_english_counts, scale, 50, 0.9988 +l1, scaled_english_counts, scale, 30, 0.9874 +l1, scaled_english_counts, scale, 20, 0.9488 +l1, scaled_english_counts, scale, 10, 0.745 +l1, scaled_english_counts, scale, 5, 0.4548 +l2, normalised_english_counts, normalise, 300, 0.9994 +l2, normalised_english_counts, normalise, 100, 0.9992 +l2, normalised_english_counts, normalise, 50, 0.9978 +l2, normalised_english_counts, normalise, 30, 0.9836 +l2, normalised_english_counts, normalise, 20, 0.9318 +l2, normalised_english_counts, normalise, 10, 0.7072 +l2, normalised_english_counts, normalise, 5, 0.4294 +l2, normalised_english_counts, scale, 300, 0.9988 +l2, normalised_english_counts, scale, 100, 0.9998 +l2, normalised_english_counts, scale, 50, 0.9978 +l2, normalised_english_counts, scale, 30, 0.9868 +l2, normalised_english_counts, scale, 20, 0.9364 +l2, normalised_english_counts, scale, 10, 0.7136 +l2, normalised_english_counts, scale, 5, 0.446 +l2, scaled_english_counts, normalise, 300, 0.9992 +l2, scaled_english_counts, normalise, 100, 0.9996 +l2, scaled_english_counts, normalise, 50, 0.9984 +l2, scaled_english_counts, normalise, 30, 0.9854 +l2, scaled_english_counts, normalise, 20, 0.9328 +l2, scaled_english_counts, normalise, 10, 0.7122 +l2, scaled_english_counts, normalise, 5, 0.4328 +l2, scaled_english_counts, scale, 300, 1.0 +l2, scaled_english_counts, scale, 100, 0.9998 +l2, scaled_english_counts, scale, 50, 0.9972 +l2, scaled_english_counts, scale, 30, 0.9842 +l2, scaled_english_counts, scale, 20, 0.9356 +l2, scaled_english_counts, scale, 10, 0.7126 +l2, scaled_english_counts, scale, 5, 0.4318 +l3, normalised_english_counts, normalise, 300, 0.9996 +l3, normalised_english_counts, normalise, 100, 0.999 +l3, normalised_english_counts, normalise, 50, 0.994 +l3, normalised_english_counts, normalise, 30, 0.9658 +l3, normalised_english_counts, normalise, 20, 0.8926 +l3, normalised_english_counts, normalise, 10, 0.6252 +l3, normalised_english_counts, normalise, 5, 0.3974 +l3, normalised_english_counts, scale, 300, 0.9996 +l3, normalised_english_counts, scale, 100, 0.998 +l3, normalised_english_counts, scale, 50, 0.9828 +l3, normalised_english_counts, scale, 30, 0.9334 +l3, normalised_english_counts, scale, 20, 0.8304 +l3, normalised_english_counts, scale, 10, 0.5968 l3, normalised_english_counts, scale, 5, 0.4114 -l3, scaled_english_counts, normalise, 3000, 0.9616 -l3, scaled_english_counts, normalise, 1000, 0.9612 -l3, scaled_english_counts, normalise, 300, 0.9624 -l3, scaled_english_counts, normalise, 100, 0.9524 -l3, scaled_english_counts, normalise, 50, 0.9474 -l3, scaled_english_counts, normalise, 30, 0.9066 -l3, scaled_english_counts, normalise, 20, 0.8004 -l3, scaled_english_counts, normalise, 10, 0.5686 -l3, scaled_english_counts, normalise, 5, 0.3404 -l3, scaled_english_counts, scale, 3000, 0.96 -l3, scaled_english_counts, scale, 1000, 0.96 -l3, scaled_english_counts, scale, 300, 0.9596 -l3, scaled_english_counts, scale, 100, 0.96 -l3, scaled_english_counts, scale, 50, 0.954 -l3, scaled_english_counts, scale, 30, 0.9374 -l3, scaled_english_counts, scale, 20, 0.862 -l3, scaled_english_counts, scale, 10, 0.6276 -l3, scaled_english_counts, scale, 5, 0.399 -cosine_distance, normalised_english_counts, normalise, 3000, 0.9618 -cosine_distance, normalised_english_counts, normalise, 1000, 0.96 -cosine_distance, normalised_english_counts, normalise, 300, 0.9604 -cosine_distance, normalised_english_counts, normalise, 100, 0.9538 -cosine_distance, normalised_english_counts, normalise, 50, 0.9608 -cosine_distance, normalised_english_counts, normalise, 30, 0.9426 -cosine_distance, normalised_english_counts, normalise, 20, 0.9012 -cosine_distance, normalised_english_counts, normalise, 10, 0.6916 -cosine_distance, normalised_english_counts, normalise, 5, 0.4286 -cosine_distance, normalised_english_counts, scale, 3000, 0.9606 -cosine_distance, normalised_english_counts, scale, 1000, 0.9572 -cosine_distance, normalised_english_counts, scale, 300, 0.9628 -cosine_distance, normalised_english_counts, scale, 100, 0.959 -cosine_distance, normalised_english_counts, scale, 50, 0.9542 -cosine_distance, normalised_english_counts, scale, 30, 0.951 -cosine_distance, normalised_english_counts, scale, 20, 0.9028 -cosine_distance, normalised_english_counts, scale, 10, 0.7028 -cosine_distance, normalised_english_counts, scale, 5, 0.44 -cosine_distance, scaled_english_counts, normalise, 3000, 0.9582 -cosine_distance, scaled_english_counts, normalise, 1000, 0.9614 -cosine_distance, scaled_english_counts, normalise, 300, 0.9632 -cosine_distance, scaled_english_counts, normalise, 100, 0.9584 -cosine_distance, scaled_english_counts, normalise, 50, 0.9574 -cosine_distance, scaled_english_counts, normalise, 30, 0.9506 -cosine_distance, scaled_english_counts, normalise, 20, 0.8956 -cosine_distance, scaled_english_counts, normalise, 10, 0.6916 -cosine_distance, scaled_english_counts, normalise, 5, 0.4356 -cosine_distance, scaled_english_counts, scale, 3000, 0.9572 -cosine_distance, scaled_english_counts, scale, 1000, 0.961 -cosine_distance, scaled_english_counts, scale, 300, 0.9596 -cosine_distance, scaled_english_counts, scale, 100, 0.9544 -cosine_distance, scaled_english_counts, scale, 50, 0.9598 -cosine_distance, scaled_english_counts, scale, 30, 0.9414 -cosine_distance, scaled_english_counts, scale, 20, 0.9036 -cosine_distance, scaled_english_counts, scale, 10, 0.6928 -cosine_distance, scaled_english_counts, scale, 5, 0.4178 +l3, scaled_english_counts, normalise, 300, 0.9994 +l3, scaled_english_counts, normalise, 100, 0.9984 +l3, scaled_english_counts, normalise, 50, 0.9876 +l3, scaled_english_counts, normalise, 30, 0.9284 +l3, scaled_english_counts, normalise, 20, 0.8322 +l3, scaled_english_counts, normalise, 10, 0.579 +l3, scaled_english_counts, normalise, 5, 0.3466 +l3, scaled_english_counts, scale, 300, 1.0 +l3, scaled_english_counts, scale, 100, 0.999 +l3, scaled_english_counts, scale, 50, 0.994 +l3, scaled_english_counts, scale, 30, 0.9688 +l3, scaled_english_counts, scale, 20, 0.8952 +l3, scaled_english_counts, scale, 10, 0.6416 +l3, scaled_english_counts, scale, 5, 0.4042 +cosine_distance, normalised_english_counts, normalise, 300, 0.9994 +cosine_distance, normalised_english_counts, normalise, 100, 1.0 +cosine_distance, normalised_english_counts, normalise, 50, 0.9978 +cosine_distance, normalised_english_counts, normalise, 30, 0.9856 +cosine_distance, normalised_english_counts, normalise, 20, 0.9374 +cosine_distance, normalised_english_counts, normalise, 10, 0.7212 +cosine_distance, normalised_english_counts, normalise, 5, 0.4282 +cosine_distance, normalised_english_counts, scale, 300, 0.9998 +cosine_distance, normalised_english_counts, scale, 100, 0.9994 +cosine_distance, normalised_english_counts, scale, 50, 0.9972 +cosine_distance, normalised_english_counts, scale, 30, 0.9846 +cosine_distance, normalised_english_counts, scale, 20, 0.9324 +cosine_distance, normalised_english_counts, scale, 10, 0.7144 +cosine_distance, normalised_english_counts, scale, 5, 0.4284 +cosine_distance, scaled_english_counts, normalise, 300, 0.9994 +cosine_distance, scaled_english_counts, normalise, 100, 0.9996 +cosine_distance, scaled_english_counts, normalise, 50, 0.9978 +cosine_distance, scaled_english_counts, normalise, 30, 0.9856 +cosine_distance, scaled_english_counts, normalise, 20, 0.935 +cosine_distance, scaled_english_counts, normalise, 10, 0.7232 +cosine_distance, scaled_english_counts, normalise, 5, 0.415 +cosine_distance, scaled_english_counts, scale, 300, 0.9982 +cosine_distance, scaled_english_counts, scale, 100, 0.9988 +cosine_distance, scaled_english_counts, scale, 50, 0.9976 +cosine_distance, scaled_english_counts, scale, 30, 0.9844 +cosine_distance, scaled_english_counts, scale, 20, 0.9314 +cosine_distance, scaled_english_counts, scale, 10, 0.7102 +cosine_distance, scaled_english_counts, scale, 5, 0.4376 +harmonic_mean, normalised_english_counts, normalise, 300, 0.4684 +harmonic_mean, normalised_english_counts, normalise, 100, 0.5068 +harmonic_mean, normalised_english_counts, normalise, 50, 0.6978 +harmonic_mean, normalised_english_counts, normalise, 30, 0.593 +harmonic_mean, normalised_english_counts, normalise, 20, 0.536 +harmonic_mean, normalised_english_counts, normalise, 10, 0.4284 +harmonic_mean, normalised_english_counts, normalise, 5, 0.3542 +harmonic_mean, normalised_english_counts, scale, 300, 0.3602 +harmonic_mean, normalised_english_counts, scale, 100, 0.57 +harmonic_mean, normalised_english_counts, scale, 50, 0.795 +harmonic_mean, normalised_english_counts, scale, 30, 0.7694 +harmonic_mean, normalised_english_counts, scale, 20, 0.6924 +harmonic_mean, normalised_english_counts, scale, 10, 0.559 +harmonic_mean, normalised_english_counts, scale, 5, 0.39 +harmonic_mean, scaled_english_counts, normalise, 300, 0.1214 +harmonic_mean, scaled_english_counts, normalise, 100, 0.132 +harmonic_mean, scaled_english_counts, normalise, 50, 0.1956 +harmonic_mean, scaled_english_counts, normalise, 30, 0.2686 +harmonic_mean, scaled_english_counts, normalise, 20, 0.258 +harmonic_mean, scaled_english_counts, normalise, 10, 0.2042 +harmonic_mean, scaled_english_counts, normalise, 5, 0.227 +harmonic_mean, scaled_english_counts, scale, 300, 0.7956 +harmonic_mean, scaled_english_counts, scale, 100, 0.5672 +harmonic_mean, scaled_english_counts, scale, 50, 0.4404 +harmonic_mean, scaled_english_counts, scale, 30, 0.3584 +harmonic_mean, scaled_english_counts, scale, 20, 0.3012 +harmonic_mean, scaled_english_counts, scale, 10, 0.2136 +harmonic_mean, scaled_english_counts, scale, 5, 0.1426 +geometric_mean, normalised_english_counts, normalise, 300, 0.9996 +geometric_mean, normalised_english_counts, normalise, 100, 0.9992 +geometric_mean, normalised_english_counts, normalise, 50, 0.9928 +geometric_mean, normalised_english_counts, normalise, 30, 0.9552 +geometric_mean, normalised_english_counts, normalise, 20, 0.8936 +geometric_mean, normalised_english_counts, normalise, 10, 0.6582 +geometric_mean, normalised_english_counts, normalise, 5, 0.4316 +geometric_mean, normalised_english_counts, scale, 300, 0.97 +geometric_mean, normalised_english_counts, scale, 100, 0.9762 +geometric_mean, normalised_english_counts, scale, 50, 0.9724 +geometric_mean, normalised_english_counts, scale, 30, 0.9224 +geometric_mean, normalised_english_counts, scale, 20, 0.8496 +geometric_mean, normalised_english_counts, scale, 10, 0.6846 +geometric_mean, normalised_english_counts, scale, 5, 0.4268 +geometric_mean, scaled_english_counts, normalise, 300, 0.9556 +geometric_mean, scaled_english_counts, normalise, 100, 0.8724 +geometric_mean, scaled_english_counts, normalise, 50, 0.7176 +geometric_mean, scaled_english_counts, normalise, 30, 0.6536 +geometric_mean, scaled_english_counts, normalise, 20, 0.5586 +geometric_mean, scaled_english_counts, normalise, 10, 0.3926 +geometric_mean, scaled_english_counts, normalise, 5, 0.319 +geometric_mean, scaled_english_counts, scale, 300, 0.7822 +geometric_mean, scaled_english_counts, scale, 100, 0.5784 +geometric_mean, scaled_english_counts, scale, 50, 0.4318 +geometric_mean, scaled_english_counts, scale, 30, 0.349 +geometric_mean, scaled_english_counts, scale, 20, 0.2932 +geometric_mean, scaled_english_counts, scale, 10, 0.2098 +geometric_mean, scaled_english_counts, scale, 5, 0.1406 diff --git a/cipher.py b/cipher.py index 0536350..b883abe 100644 --- a/cipher.py +++ b/cipher.py @@ -21,6 +21,9 @@ def sanitise(text): sanitised = [c.lower() for c in text if c in string.ascii_letters] return ''.join(sanitised) +def ngrams(text, n): + return [tuple(text[i:i+n]) for i in range(len(text)-n+1)] + def letter_frequencies(text): """Count the number of occurrences of each character in text @@ -105,10 +108,22 @@ def caesar_decipher(message, shift): return caesar_encipher(message, -shift) def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise): + """Breaks a Caesar cipher using frequency analysis + + + >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') + (4, 0.3186395289018361) + >>> caesar_break('jhzhuhfrqilqhgwrdevwudfwuhdvrqlqjwkhqkdylqjvxemhfwhgwrfulwlflvpwkhhasodqdwlrqrisrzhuwkdwmxulglfdovfl') + (3, 0.32902042861730835) + >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') + (19, 0.4215290123583277) + >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') + (13, 0.31602920807545154) + """ sanitised_message = sanitise(message) best_shift = 0 best_fit = float("inf") - for shift in range(1, 25): + for shift in range(26): plaintext = caesar_decipher(sanitised_message, shift) frequencies = message_frequency_scaling(letter_frequencies(plaintext)) fit = metric(target_frequencies, frequencies) diff --git a/find_best_caesar_break_parameters.py b/find_best_caesar_break_parameters.py index 711cff0..ed8bbaa 100644 --- a/find_best_caesar_break_parameters.py +++ b/find_best_caesar_break_parameters.py @@ -11,7 +11,7 @@ scaled_english_counts = norms.scale(english_counts) metrics = [norms.l1, norms.l2, norms.l3, norms.cosine_distance, norms.harmonic_mean, norms.geometric_mean] corpus_frequencies = [normalised_english_counts, scaled_english_counts] scalings = [norms.normalise, norms.scale] -message_lengths = [3000, 1000, 300, 100, 50, 30, 20, 10, 5] +message_lengths = [300, 100, 50, 30, 20, 10, 5] metric_names = ['l1', 'l2', 'l3', 'cosine_distance', 'harmonic_mean', 'geometric_mean'] corpus_frequency_names = ['normalised_english_counts', 'scaled_english_counts'] diff --git a/norms.py b/norms.py index 4fdf1e3..08cff74 100644 --- a/norms.py +++ b/norms.py @@ -97,24 +97,52 @@ def l3(frequencies1, frequencies2): return total ** (1/3) def geometric_mean(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as dictionaries. + """Finds the geometric mean of the absolute differences between two frequency profiles, + expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 - + + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1 + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1 + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) + 3 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1})) + 0.057022248808851934 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1})) + 0.0 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0})) + 0.009720703533656434 """ - total = 0 + total = 1 for k in frequencies1.keys(): total *= abs(frequencies1[k] - frequencies2[k]) return total def harmonic_mean(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as dictionaries. + """Finds the harmonic mean of the absolute differences between two frequency profiles, + expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1.0 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1.0 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) + 1.2857142857142858 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1})) + 0.3849001794597505 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1})) + 0 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0})) + 0.17497266360581604 """ total = 0 for k in frequencies1.keys(): + if abs(frequencies1[k] - frequencies2[k]) == 0: + return 0 total += 1 / abs(frequencies1[k] - frequencies2[k]) - return 1 / total + return len(frequencies1) / total def cosine_distance(frequencies1, frequencies2): -- 2.43.0