From 26d9d2228e47a6ff8b8696d37c0a8d6d6b906c67 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Sun, 6 Oct 2013 18:09:49 +0100 Subject: [PATCH] Added experiment for checking which metrics work best for using unigram frequency analysis for breaking Caesar ciphers --- .gitignore | 8 ++ __pycache__/cipher.cpython-33.pyc | Bin 0 -> 5874 bytes caesar_break_parameter_trials.csv | 144 +++++++++++++++++++++++++++ find_best_caesar_break_parameters.py | 60 +++++++++++ norms.py | 21 ++++ norms.pyc | Bin 0 -> 5694 bytes 6 files changed, 233 insertions(+) create mode 100644 .gitignore create mode 100644 __pycache__/cipher.cpython-33.pyc create mode 100644 caesar_break_parameter_trials.csv create mode 100644 find_best_caesar_break_parameters.py create mode 100644 norms.pyc diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..14ba524 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*~ +/.bundle +/config/database.yml +/config/deploy.rb +*doc +/log/*log +/tmp + diff --git a/__pycache__/cipher.cpython-33.pyc b/__pycache__/cipher.cpython-33.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05222b00a2b63868a0e7d9cae75cb0884fed535a GIT binary patch literal 5874 zcmd^DZExJh5gt63%DUy7iApkMoMivE=Tgg!G%?op@Lz#n@ySI_Rw?#|B4&dlyVow>&LSDUZ@ zeV53m+HV&4xA?Nh7Eyrz7DYrm23xdaRqZzI*rY7#jgcCscj%fRHA#U@!5F=>>4)FY z&J-PeLTZZEcR!|`X;RbFo1s9czr*+j?acCEHOu1_#y4p7@N3%M#=@I_R5tY-Zsy77 zD43vNlD-E&cPN;mU|R4z1(>oS_yY=NDR2ber2s?j2wtE7WFH8=M?CF)QgZ?v3P8C) z-;dGG15%3=+@k}V(uY>L{UzF2BDF-lhxCrfibW_}&HIp2h>tc32c@4Bao&C&weq99 zfb1xha>i%OKfC;1;&0}nzz!iQ?BDY<)QETT`%?gT&0=o#ewT5@f(%qZr)XHu8sf_H%i{B)b}#gaPgC3ZEelX zyg1CmOf{Fk1@UGP`UlG^(z}e&hf=?e*!|-5tL+W9lN`CdqCZf<6Bj(DtyW7`Z+;$m zC&w%9`l~NTr`=1Fw=u@)VLS*6GIfybFUykWWX4k1!k2xFVqc`S`-FuBG37)(iw-6! zo#blIW)1zrro%~y59HJwLs3kk$nrFd_lr3|FZ08&9jTnmMD+{E{bG*z*$GOjAm z?+y*r4P!A$QSi{Pk~CLAQ|qGLNnv;xAF8am?Ctr1>Y!X+X@$A!XU!FDpMOT*@>BQ4 z%2T(wyl0SaPyqB?dX+&PLA$qEm=BNGzblqnGjXwS*)*H0%l3MaHy11YT2^N+H6P5S zM$Dx~%%$D~bNQ)^&|LDq_6=Rv(1oV0XV8H`kwJZfVuKPOy~e<_9U7DZXU*Rt#zZeIf$>+_ZgR(7Hii8nj{1DIILSe>8)Mf3NH2Qa zhiJ_>aF8=^%~=apaXFpOagKk?2Y(VzJOJb|!+`v!s%lGoen$BiXFMEgY>FMJ>d0eL zKF(DPM=zqzLxh?oq(Y0pOK93c&sH#w@WbJ&_!mU=djts%#jfY-oji5-!#65MJ?n;@ z-1Yh#pc=;C`<}|Yw9UbV$Cc4vN0R4d-m|B(56+IRBY#nn`!|#8=p;-m2RD=Rd8}qX zxtW|jZ$<9iOwR7PBEQ8}nfzHLn&X?N=hJKGOYX%KeLCo{aN)x<7B>pY2$# zU0h9XP6eM1b1Ik_90}MjOnMeqqAi{H{ebp$bvqM2m$jG72IW{|#@bc@&375VMtJHo>sFp*@;G+meK2$hj4OSJm3U!d|=q7Lp$a)tVfNLa2= za`ogZl(?4i3T2;n7=4vx-oA3*hIv<#xtHP_+)^FLT(v6^dlHde<=I{T$Q>EusuQ;w<#UALA6qCp(tG%_pP?u(RS)=5 z#nx)k-{6Yv^A{-|UDwp8C*@h|m#0V@6fY!#6f3#m8uc9$^F+8teS|+FNAtF~DK?a1 z3Pq{uns?x9Tg5r<9jKL8F~jEt7TFz*F!BwW^kvI=WEGbX7`wfdH~0^xs|B?8x6kSn z;d%s%9NH&erbcafNXr&Ej1V``mZnu7P#U1DptXOAXtW=>R6Ex=lG;QS!);aq%adsk#Tk)XA_i*8{w8oI@%-Lh>?9D;lLSK-`%9MDM?Sp z>2!*l&Zg;fjB}RLX--&p*i3m}KAjPQ2A$&nBpu)tsKprbVk}lVZ1Ml?(+TeGa6Dkc z+mLspGO_DODi}UOyAGDq%y{{rAg0`j5-%5&dhx!xZ1-2>ZcP@(xud}jy|3^#;>!@k z%YBvCFX9C=QktY`KW=Au*5LOPydN6of_8P__?YdfEN|_Wk ze)VA1Y7YP+4bWd>6!Pq*^SZrMc?S?TWgNlCa;!;wi-_C{__#J_>1Vncy*!h&+kJ4* zZZ~%r=EthqT(d&Uzw0g>Y?I&y!3S2Wh%~wmkd-aDNsiqd>}TZJ<=e8I9LI- zh$`dc1&G&zj58$~Ftq?V?71b*naAZR`!#;M-c=o)GOMWDRpm45Z56o~jw<{p43rnQ z13Ywjcmujh+_BeJ?Y0`BW3#U3 atOwSI*8G?=>)dsI>NK2r$N9+}?*AV}yjgt! literal 0 HcmV?d00001 diff --git a/caesar_break_parameter_trials.csv b/caesar_break_parameter_trials.csv new file mode 100644 index 0000000..df9b836 --- /dev/null +++ b/caesar_break_parameter_trials.csv @@ -0,0 +1,144 @@ +l1, normalised_english_counts, normalise, 3000, 0.9616 +l1, normalised_english_counts, normalise, 1000, 0.9562 +l1, normalised_english_counts, normalise, 300, 0.9598 +l1, normalised_english_counts, normalise, 100, 0.9622 +l1, normalised_english_counts, normalise, 50, 0.9584 +l1, normalised_english_counts, normalise, 30, 0.953 +l1, normalised_english_counts, normalise, 20, 0.917 +l1, normalised_english_counts, normalise, 10, 0.7328 +l1, normalised_english_counts, normalise, 5, 0.4394 +l1, normalised_english_counts, scale, 3000, 0.9618 +l1, normalised_english_counts, scale, 1000, 0.9574 +l1, normalised_english_counts, scale, 300, 0.9624 +l1, normalised_english_counts, scale, 100, 0.9566 +l1, normalised_english_counts, scale, 50, 0.959 +l1, normalised_english_counts, scale, 30, 0.9476 +l1, normalised_english_counts, scale, 20, 0.8968 +l1, normalised_english_counts, scale, 10, 0.6844 +l1, normalised_english_counts, scale, 5, 0.4298 +l1, scaled_english_counts, normalise, 3000, 0.957 +l1, scaled_english_counts, normalise, 1000, 0.9662 +l1, scaled_english_counts, normalise, 300, 0.9604 +l1, scaled_english_counts, normalise, 100, 0.9602 +l1, scaled_english_counts, normalise, 50, 0.9578 +l1, scaled_english_counts, normalise, 30, 0.9504 +l1, scaled_english_counts, normalise, 20, 0.9174 +l1, scaled_english_counts, normalise, 10, 0.7204 +l1, scaled_english_counts, normalise, 5, 0.4506 +l1, scaled_english_counts, scale, 3000, 0.9584 +l1, scaled_english_counts, scale, 1000, 0.9586 +l1, scaled_english_counts, scale, 300, 0.964 +l1, scaled_english_counts, scale, 100, 0.9582 +l1, scaled_english_counts, scale, 50, 0.9606 +l1, scaled_english_counts, scale, 30, 0.944 +l1, scaled_english_counts, scale, 20, 0.915 +l1, scaled_english_counts, scale, 10, 0.7324 +l1, scaled_english_counts, scale, 5, 0.4446 +l2, normalised_english_counts, normalise, 3000, 0.953 +l2, normalised_english_counts, normalise, 1000, 0.962 +l2, normalised_english_counts, normalise, 300, 0.9638 +l2, normalised_english_counts, normalise, 100, 0.9632 +l2, normalised_english_counts, normalise, 50, 0.9604 +l2, normalised_english_counts, normalise, 30, 0.95 +l2, normalised_english_counts, normalise, 20, 0.892 +l2, normalised_english_counts, normalise, 10, 0.7124 +l2, normalised_english_counts, normalise, 5, 0.4406 +l2, normalised_english_counts, scale, 3000, 0.9626 +l2, normalised_english_counts, scale, 1000, 0.956 +l2, normalised_english_counts, scale, 300, 0.962 +l2, normalised_english_counts, scale, 100, 0.9572 +l2, normalised_english_counts, scale, 50, 0.9526 +l2, normalised_english_counts, scale, 30, 0.9478 +l2, normalised_english_counts, scale, 20, 0.9046 +l2, normalised_english_counts, scale, 10, 0.6896 +l2, normalised_english_counts, scale, 5, 0.4308 +l2, scaled_english_counts, normalise, 3000, 0.9574 +l2, scaled_english_counts, normalise, 1000, 0.9568 +l2, scaled_english_counts, normalise, 300, 0.9536 +l2, scaled_english_counts, normalise, 100, 0.9624 +l2, scaled_english_counts, normalise, 50, 0.9606 +l2, scaled_english_counts, normalise, 30, 0.9384 +l2, scaled_english_counts, normalise, 20, 0.8914 +l2, scaled_english_counts, normalise, 10, 0.6892 +l2, scaled_english_counts, normalise, 5, 0.4196 +l2, scaled_english_counts, scale, 3000, 0.9532 +l2, scaled_english_counts, scale, 1000, 0.9588 +l2, scaled_english_counts, scale, 300, 0.9644 +l2, scaled_english_counts, scale, 100, 0.9572 +l2, scaled_english_counts, scale, 50, 0.9586 +l2, scaled_english_counts, scale, 30, 0.9436 +l2, scaled_english_counts, scale, 20, 0.9036 +l2, scaled_english_counts, scale, 10, 0.693 +l2, scaled_english_counts, scale, 5, 0.4376 +l3, normalised_english_counts, normalise, 3000, 0.9626 +l3, normalised_english_counts, normalise, 1000, 0.9582 +l3, normalised_english_counts, normalise, 300, 0.9542 +l3, normalised_english_counts, normalise, 100, 0.9606 +l3, normalised_english_counts, normalise, 50, 0.953 +l3, normalised_english_counts, normalise, 30, 0.9248 +l3, normalised_english_counts, normalise, 20, 0.8684 +l3, normalised_english_counts, normalise, 10, 0.6106 +l3, normalised_english_counts, normalise, 5, 0.4064 +l3, normalised_english_counts, scale, 3000, 0.961 +l3, normalised_english_counts, scale, 1000, 0.9568 +l3, normalised_english_counts, scale, 300, 0.9566 +l3, normalised_english_counts, scale, 100, 0.9554 +l3, normalised_english_counts, scale, 50, 0.9436 +l3, normalised_english_counts, scale, 30, 0.8936 +l3, normalised_english_counts, scale, 20, 0.8016 +l3, normalised_english_counts, scale, 10, 0.579 +l3, normalised_english_counts, scale, 5, 0.4114 +l3, scaled_english_counts, normalise, 3000, 0.9616 +l3, scaled_english_counts, normalise, 1000, 0.9612 +l3, scaled_english_counts, normalise, 300, 0.9624 +l3, scaled_english_counts, normalise, 100, 0.9524 +l3, scaled_english_counts, normalise, 50, 0.9474 +l3, scaled_english_counts, normalise, 30, 0.9066 +l3, scaled_english_counts, normalise, 20, 0.8004 +l3, scaled_english_counts, normalise, 10, 0.5686 +l3, scaled_english_counts, normalise, 5, 0.3404 +l3, scaled_english_counts, scale, 3000, 0.96 +l3, scaled_english_counts, scale, 1000, 0.96 +l3, scaled_english_counts, scale, 300, 0.9596 +l3, scaled_english_counts, scale, 100, 0.96 +l3, scaled_english_counts, scale, 50, 0.954 +l3, scaled_english_counts, scale, 30, 0.9374 +l3, scaled_english_counts, scale, 20, 0.862 +l3, scaled_english_counts, scale, 10, 0.6276 +l3, scaled_english_counts, scale, 5, 0.399 +cosine_distance, normalised_english_counts, normalise, 3000, 0.9618 +cosine_distance, normalised_english_counts, normalise, 1000, 0.96 +cosine_distance, normalised_english_counts, normalise, 300, 0.9604 +cosine_distance, normalised_english_counts, normalise, 100, 0.9538 +cosine_distance, normalised_english_counts, normalise, 50, 0.9608 +cosine_distance, normalised_english_counts, normalise, 30, 0.9426 +cosine_distance, normalised_english_counts, normalise, 20, 0.9012 +cosine_distance, normalised_english_counts, normalise, 10, 0.6916 +cosine_distance, normalised_english_counts, normalise, 5, 0.4286 +cosine_distance, normalised_english_counts, scale, 3000, 0.9606 +cosine_distance, normalised_english_counts, scale, 1000, 0.9572 +cosine_distance, normalised_english_counts, scale, 300, 0.9628 +cosine_distance, normalised_english_counts, scale, 100, 0.959 +cosine_distance, normalised_english_counts, scale, 50, 0.9542 +cosine_distance, normalised_english_counts, scale, 30, 0.951 +cosine_distance, normalised_english_counts, scale, 20, 0.9028 +cosine_distance, normalised_english_counts, scale, 10, 0.7028 +cosine_distance, normalised_english_counts, scale, 5, 0.44 +cosine_distance, scaled_english_counts, normalise, 3000, 0.9582 +cosine_distance, scaled_english_counts, normalise, 1000, 0.9614 +cosine_distance, scaled_english_counts, normalise, 300, 0.9632 +cosine_distance, scaled_english_counts, normalise, 100, 0.9584 +cosine_distance, scaled_english_counts, normalise, 50, 0.9574 +cosine_distance, scaled_english_counts, normalise, 30, 0.9506 +cosine_distance, scaled_english_counts, normalise, 20, 0.8956 +cosine_distance, scaled_english_counts, normalise, 10, 0.6916 +cosine_distance, scaled_english_counts, normalise, 5, 0.4356 +cosine_distance, scaled_english_counts, scale, 3000, 0.9572 +cosine_distance, scaled_english_counts, scale, 1000, 0.961 +cosine_distance, scaled_english_counts, scale, 300, 0.9596 +cosine_distance, scaled_english_counts, scale, 100, 0.9544 +cosine_distance, scaled_english_counts, scale, 50, 0.9598 +cosine_distance, scaled_english_counts, scale, 30, 0.9414 +cosine_distance, scaled_english_counts, scale, 20, 0.9036 +cosine_distance, scaled_english_counts, scale, 10, 0.6928 +cosine_distance, scaled_english_counts, scale, 5, 0.4178 diff --git a/find_best_caesar_break_parameters.py b/find_best_caesar_break_parameters.py new file mode 100644 index 0000000..711cff0 --- /dev/null +++ b/find_best_caesar_break_parameters.py @@ -0,0 +1,60 @@ +import random +from cipher import * + + +corpus = sanitise(''.join([open('shakespeare.txt', 'r').read(), open('sherlock-holmes.txt', 'r').read(), open('war-and-peace.txt', 'r').read()])) +corpus_length = len(corpus) + +scaled_english_counts = norms.scale(english_counts) + + +metrics = [norms.l1, norms.l2, norms.l3, norms.cosine_distance, norms.harmonic_mean, norms.geometric_mean] +corpus_frequencies = [normalised_english_counts, scaled_english_counts] +scalings = [norms.normalise, norms.scale] +message_lengths = [3000, 1000, 300, 100, 50, 30, 20, 10, 5] + +metric_names = ['l1', 'l2', 'l3', 'cosine_distance', 'harmonic_mean', 'geometric_mean'] +corpus_frequency_names = ['normalised_english_counts', 'scaled_english_counts'] +scaling_names = ['normalise', 'scale'] + +trials = 5000 + +scores = collections.defaultdict(int) +for metric in range(len(metrics)): + scores[metric_names[metric]] = collections.defaultdict(int) + for corpus_freqency in range(len(corpus_frequencies)): + scores[metric_names[metric]][corpus_frequency_names[corpus_freqency]] = collections.defaultdict(int) + for scaling in range(len(scalings)): + scores[metric_names[metric]][corpus_frequency_names[corpus_freqency]][scaling_names[scaling]] = collections.defaultdict(int) + for message_length in message_lengths: + for i in range(trials): + sample_start = random.randint(0, corpus_length - message_length) + sample = corpus[sample_start:(sample_start + message_length)] + key = random.randint(1, 25) + sample_ciphertext = caesar_encipher(sample, key) + (found_key, score) = caesar_break(sample_ciphertext, + metric=metrics[metric], + target_frequencies=corpus_frequencies[corpus_freqency], + message_frequency_scaling=scalings[scaling]) + if found_key == key: + scores[metric_names[metric]][corpus_frequency_names[corpus_freqency]][scaling_names[scaling]][message_length] += 1 + print(', '.join([metric_names[metric], + corpus_frequency_names[corpus_freqency], + scaling_names[scaling], + str(message_length), + str(scores[metric_names[metric]][corpus_frequency_names[corpus_freqency]][scaling_names[scaling]][message_length] / trials) ])) + + +with open('caesar_break_parameter_trials.csv', 'w') as f: + for metric in range(len(metrics)): + for corpus_freqency in range(len(corpus_frequencies)): + for scaling in range(len(scalings)): + for message_length in message_lengths: + print(', '.join([metric_names[metric], + corpus_frequency_names[corpus_freqency], + scaling_names[scaling], + str(message_length), + str(scores[metric_names[metric]][corpus_frequency_names[corpus_freqency]][scaling_names[scaling]][message_length] / trials) ]), + file=f) + + \ No newline at end of file diff --git a/norms.py b/norms.py index 744cbe4..4fdf1e3 100644 --- a/norms.py +++ b/norms.py @@ -96,6 +96,27 @@ def l3(frequencies1, frequencies2): total += abs(frequencies1[k] - frequencies2[k]) ** 3 return total ** (1/3) +def geometric_mean(frequencies1, frequencies2): + """Finds the distances between two frequency profiles, expressed as dictionaries. + Assumes every key in frequencies1 is also in frequencies2 + + """ + total = 0 + for k in frequencies1.keys(): + total *= abs(frequencies1[k] - frequencies2[k]) + return total + +def harmonic_mean(frequencies1, frequencies2): + """Finds the distances between two frequency profiles, expressed as dictionaries. + Assumes every key in frequencies1 is also in frequencies2 + + """ + total = 0 + for k in frequencies1.keys(): + total += 1 / abs(frequencies1[k] - frequencies2[k]) + return 1 / total + + def cosine_distance(frequencies1, frequencies2): """Finds the distances between two frequency profiles, expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 diff --git a/norms.pyc b/norms.pyc new file mode 100644 index 0000000000000000000000000000000000000000..540caf5c836c654bd2ccefced84763193e2689c8 GIT binary patch literal 5694 zcmd5=ZBN`r5FY!0(de+{3cJ}spW_Dfs`^xNJ-+bqXP<&*( zeuiJZhed?{5+neR(U9RMG$e>>!jK`F5=ISJ4IWQHG%c#r5X}f<2BJ&CxCHDnM3>>Y z4BhrIW*r{afz@GW7G42JsTgTpt>dWiKlphVN@$Wd)A+67m#<@y0c-&r0qjWB@5?9i z(3ju{y^^5uREE8Ic!d=i8oyy<8Fp%rO~Ee!o5Jy*^S;xfFE)H7!@4>};58g-Jq!}2 z1PW)F(%DotyKJw=(wK3Dca&_K9V*+wJ~im2ai*|dn8XnaQYB&OR<=E>;71f|Yirn} zo3ThuJKc7W#GI)w^?QoGtQhwct-sWaGuGzn(voQZgNhIIrna=KsK&CQq4nf6K>Ao9 zt?6ivuGx;O>#p6hoK}fD^5AsCEGs7Z{+hvBj$^ho!}fI7vyGNp4mR$ANwXVya4IkG zhne7orMZUX8lGm^rfZve>4mG?mS?wI%`+Uwu*C!6#Yhj~MNEuChSBHG2ZDpdJ)($- zU;Z%`$i64H;76I5*O!TWM>6~hu(&D1_tIXCD88-|eIKe=fJz9+MS!N3(e3A=+!sx< z8N87bNHpeOV10GF(`KtFi<8waJ7KTQ(u}X}b~{_$pxutst<^Nh;!YYQD`C95&AKb$ zc90MVc0$7O=I$XUCEed*DLdHhuFYeGlLs@P(tsJc#CvTq`hJk~7|-6rTVV%bLc-4L z5!(!UNfyOnCc4CFmXRKF`Vye%RwyMzsfej<=JTex`eLLDnp8%UDkUY5X=Pw;MG{Lz z1JVug#6<(cMHwql?qgzt1Y=_YO^qjrH$EoNOpBnPBtdrz=dHwC(Z|LBPGRN0>c0vy zUXFtxLe_|`MnEQL>Le6KCREL-Kqm|JI2aZ1c|a&Uy+FD7xPIUZ0@70EQoDp!v086Y zYqxQf5v{d@1D}NBllpve#J@}h5n4Fs&yt1TqIw0bi;4w8vSRJ86KXD*>rzO+3{}W6 zQYe3b1twAqY6ndr9LSTb0{t2VIn>fo4L+cVhnJg_W*uz2*MLX(G^#@mFz|5IK7643P!fgIAMHQp?R^n&_%H|g|a^9 z7_$(D?X&Ko@*ER#oR&D)N+%Hp31;h)_l=XfObqqq-Qey$%({1<<(CjI{bgnB?L0|i zRxWe$F#uAX0O*1M#$*7!>6nJr(p=4H*{+M(Xc(`IYPoAwWXxxk7FFkqy~H>-wG(b? z7wBdc`3!D)x@T#+?b?p5d$xVz#}La6 z$PjaEMTwZ(hJ?7T(op;l41e#G2nT%E&m+K*?p2XY)UP?*y`rTm&>3hv6cma{RW)&X zuTDk#0qMmm`?R$ad$ZtKPIdnp-tj5Y`wc263BD~h>mGvVY`#itR>rXTYiv@m*&1f^ zCzIGbnp*JA;&FOG`(0(Wc^+mDr>p(GAbm zT3)`B&vOf%CSsu?3adhJ{vui;vHYKEGrz@AhjPb9x!Ndilv}vpv68EeN<52VIRyvz zhn3fa{`B#7ZaUx2;bnm`2=41BwO}<@U=8+e!=r;cbjsZ+c7qF|`;<8NVnYY<_nyKd z-2TxP5r>?D#u2^`!pp+q&2^Q0miUz9ke=IWa1SZ{K_)K&$y*qmui!!umZG<4V1O{-8V*@