From 26f5de0a23dd94ded412f6f507910ac5e26ea2b6 Mon Sep 17 00:00:00 2001
From: Neil Smith <neil.github@njae.me.uk>
Date: Sun, 6 Oct 2013 12:37:11 +0100
Subject: [PATCH] Pulled out the norms into a separate file, corrected a bug in
 normalise, redid the tests.
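
The old normalise_frequencies scaled a frequency profile so that its
values summed to 1. norms.normalise instead scales a profile to unit
Euclidean length, so proportional profiles become identical vectors
(see the reworked l2 doctests). A rough sketch of the new behaviour,
illustrative only and not part of the doctests:

    >>> import norms
    >>> sorted(norms.normalise({'a': 3, 'b': 4}).items())
    [('a', 0.6), ('b', 0.8)]

which is a unit vector (0.6^2 + 0.8^2 = 1).

caesar_break now takes its comparison machinery as keyword arguments,
so callers can experiment with other norms and scalings, e.g.
(assuming cipher.py and count_1l.txt are in the working directory):

    import cipher, norms
    # ciphertext: any string enciphered with an unknown Caesar shift
    shift, fit = cipher.caesar_break(ciphertext, metric=norms.l1)

Also squared the frequencies2 term in cosine_distance's denominator:
it was previously summed unsquared, so the second profile's length was
wrong whenever its values were not all 0 or 1.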

---
 cipher.py | 131 ++++---------------------------
 norms.py  | 125 ++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 116 deletions(-)
 create mode 100644 norms.py

diff --git a/cipher.py b/cipher.py
index 0fc9e88..0536350 100644
--- a/cipher.py
+++ b/cipher.py
@@ -1,5 +1,13 @@
 import string
 import collections
+import norms
+
+english_counts = collections.defaultdict(int)
+with open('count_1l.txt', 'r') as f:
+    for line in f:
+        (letter, count) = line.split("\t")
+        english_counts[letter] = int(count)
+normalised_english_counts = norms.normalise(english_counts)
 
 
 def sanitise(text):
@@ -30,109 +38,6 @@ def letter_frequencies(text):
         counts[c] += 1
     return counts
 
-
-def normalise_frequencies(frequencies):
-    """Scale a set of letter frequenies so they add to 1
-    
-    >>> sorted(normalise_frequencies(letter_frequencies('abcdefabc')).items())
-    [('a', 0.2222222222222222), ('b', 0.2222222222222222), ('c', 0.2222222222222222), ('d', 0.1111111111111111), ('e', 0.1111111111111111), ('f', 0.1111111111111111)]
-    >>> sorted(normalise_frequencies(letter_frequencies('the quick brown fox jumped over the lazy dog')).items())
-    [(' ', 0.18181818181818182), ('a', 0.022727272727272728), ('b', 0.022727272727272728), ('c', 0.022727272727272728), ('d', 0.045454545454545456), ('e', 0.09090909090909091), ('f', 0.022727272727272728), ('g', 0.022727272727272728), ('h', 0.045454545454545456), ('i', 0.022727272727272728), ('j', 0.022727272727272728), ('k', 0.022727272727272728), ('l', 0.022727272727272728), ('m', 0.022727272727272728), ('n', 0.022727272727272728), ('o', 0.09090909090909091), ('p', 0.022727272727272728), ('q', 0.022727272727272728), ('r', 0.045454545454545456), ('t', 0.045454545454545456), ('u', 0.045454545454545456), ('v', 0.022727272727272728), ('w', 0.022727272727272728), ('x', 0.022727272727272728), ('y', 0.022727272727272728), ('z', 0.022727272727272728)]
-    >>> sorted(normalise_frequencies(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
-    [(' ', 0.1568627450980392), ('!', 0.0196078431372549), ('(', 0.0196078431372549), (')', 0.0196078431372549), ('.', 0.058823529411764705), ('9', 0.0196078431372549), ('B', 0.0196078431372549), ('D', 0.0196078431372549), ('G', 0.0196078431372549), ('N', 0.0196078431372549), ('O', 0.0392156862745098), ('Q', 0.0196078431372549), ('R', 0.0196078431372549), ('T', 0.0196078431372549), ('W', 0.0196078431372549), ('a', 0.0196078431372549), ('c', 0.0196078431372549), ('d', 0.0196078431372549), ('e', 0.0784313725490196), ('f', 0.0196078431372549), ('h', 0.0392156862745098), ('i', 0.0196078431372549), ('j', 0.0196078431372549), ('k', 0.0196078431372549), ('l', 0.0196078431372549), ('m', 0.0196078431372549), ('o', 0.0392156862745098), ('p', 0.0196078431372549), ('r', 0.0196078431372549), ('t', 0.0196078431372549), ('u', 0.0392156862745098), ('v', 0.0196078431372549), ('x', 0.0196078431372549), ('y', 0.0196078431372549), ('z', 0.0196078431372549)]
-    >>> sorted(normalise_frequencies(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG'))).items())
-    [('a', 0.027777777777777776), ('b', 0.027777777777777776), ('c', 0.027777777777777776), ('d', 0.05555555555555555), ('e', 0.1111111111111111), ('f', 0.027777777777777776), ('g', 0.027777777777777776), ('h', 0.05555555555555555), ('i', 0.027777777777777776), ('j', 0.027777777777777776), ('k', 0.027777777777777776), ('l', 0.027777777777777776), ('m', 0.027777777777777776), ('n', 0.027777777777777776), ('o', 0.1111111111111111), ('p', 0.027777777777777776), ('q', 0.027777777777777776), ('r', 0.05555555555555555), ('t', 0.05555555555555555), ('u', 0.05555555555555555), ('v', 0.027777777777777776), ('w', 0.027777777777777776), ('x', 0.027777777777777776), ('y', 0.027777777777777776), ('z', 0.027777777777777776)]
-    """
-    total = sum(frequencies.values())
-    return dict((k, v / total) for (k, v) in frequencies.items())
-
-def l2_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-    
-    >>> l2_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l2_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l2_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.816496580927726
-    >>> l2_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.7071067811865476
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += (f1n[k] - f2n[k]) ** 2
-    return total ** 0.5
-euclidean_distance = l2_norm
-
-def l1_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> l1_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l1_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l1_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    1.3333333333333333
-    >>> l1_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    1.0
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += abs(f1n[k] - f2n[k])
-    return total
-
-def l3_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> l3_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l3_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l3_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.7181448966772946
-    >>> l3_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.6299605249474366
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += abs(f1n[k] - f2n[k]) ** 3
-    return total ** (1/3)
-
-def cosine_distance(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    -2.220446049250313e-16
-    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    -2.220446049250313e-16
-    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.42264973081037416
-    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.29289321881345254
-    """
-    numerator = 0
-    length1 = 0
-    length2 = 0
-    for k in frequencies1.keys():
-        numerator += frequencies1[k] * frequencies2[k]
-        length1 += frequencies1[k]**2
-    for k in frequencies2.keys():
-        length2 += frequencies2[k]
-    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
-
-
-
-
 def caesar_encipher_letter(letter, shift):
     """Encipher a letter, given a shift amount
 
@@ -199,30 +104,24 @@ def caesar_decipher(message, shift):
     """
     return caesar_encipher(message, -shift)
 
-def caesar_break(message, metric=euclidean_distance):
+def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
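+    """Breaks a Caesar cipher using frequency analysis: tries each candidate
+    shift, scales the letter frequencies of the trial decipherment with
+    message_frequency_scaling, and returns the (shift, fit) pair whose
+    frequencies best match target_frequencies under the given metric."""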
     sanitised_message = sanitise(message)
     best_shift = 0
     best_fit = float("inf")
     for shift in range(1, 26):
         plaintext = caesar_decipher(sanitised_message, shift)
-        frequencies = letter_frequencies(plaintext)
-        fit = metric(english_counts, frequencies)
+        frequencies = message_frequency_scaling(letter_frequencies(plaintext))
+        fit = metric(target_frequencies, frequencies)
         if fit < best_fit:
             best_fit = fit
             best_shift = shift
     return best_shift, best_fit
 
 
-
-
-
-english_counts = collections.defaultdict(int)
-with open('count_1l.txt', 'r') as f:
-    for line in f:
-        (letter, count) = line.split("\t")
-        english_counts[letter] = int(count)
-
-
 if __name__ == "__main__":
     import doctest
     doctest.testmod()
diff --git a/norms.py b/norms.py
new file mode 100644
index 0000000..744cbe4
--- /dev/null
+++ b/norms.py
@@ -0,0 +1,125 @@
+import collections
+
+def normalise(frequencies):
+    """Scale a set of frequenies so they have a unit euclidean length
+    
+    >>> sorted(normalise({1: 1, 2: 0}).items())
+    [(1, 1.0), (2, 0.0)]
+    >>> sorted(normalise({1: 1, 2: 1}).items())
+    [(1, 0.7071067811865475), (2, 0.7071067811865475)]
+    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
+    [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
+    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
+    [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
+   """
+    length = sum([f ** 2 for f in frequencies.values()]) ** 0.5
+    return collections.defaultdict(int, ((k, v / length) for (k, v) in frequencies.items()))
+
+def scale(frequencies):
+    """Scale a set of frequencies so the largest is 1
+    
+    >>> sorted(scale({1: 1, 2: 0}).items())
+    [(1, 1.0), (2, 0.0)]
+    >>> sorted(scale({1: 1, 2: 1}).items())
+    [(1, 1.0), (2, 1.0)]
+    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
+    [(1, 1.0), (2, 1.0), (3, 1.0)]
+    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
+    [(1, 0.5), (2, 1.0), (3, 0.5)]
+    """
+    largest = max(frequencies.values())
+    return collections.defaultdict(int, ((k, v / largest) for (k, v) in frequencies.items()))
+
+
+def l2(frequencies1, frequencies2):
+    """Finds the distances between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+    
+    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0.0
+    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.7320508075688772
+    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0.0
+    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    1.7320508075688772
+    >>> l2(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
+    0.9194016867619662
+    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
+    1.0
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += (frequencies1[k] - frequencies2[k]) ** 2
+    return total ** 0.5
+euclidean_distance = l2
+
+def l1(frequencies1, frequencies2):
+    """Finds the distances between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0
+    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    3
+    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0.0
+    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    3
+    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
+    1
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += abs(frequencies1[k] - frequencies2[k])
+    return total
+
+def l3(frequencies1, frequencies2):
+    """Finds the distances between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0.0
+    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.4422495703074083
+    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    1.4422495703074083
+    >>> l3(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
+    0.7721675487598008
+    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
+    1.0
+    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1}))
+    0.7234757712960591
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += abs(frequencies1[k] - frequencies2[k]) ** 3
+    return total ** (1/3)
+
+def cosine_distance(frequencies1, frequencies2):
+    """Finds the distances between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    -2.220446049250313e-16
+    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    -2.220446049250313e-16
+    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    0.42264973081037416
+    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
+    0.29289321881345254
+    """
+    numerator = 0
+    length1 = 0
+    length2 = 0
+    for k in frequencies1.keys():
+        numerator += frequencies1[k] * frequencies2[k]
+        length1 += frequencies1[k]**2
+    for k in frequencies2.keys():
+        length2 += frequencies2[k] ** 2
+    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
-- 
2.43.0