import string
import collections
+import norms
+
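+# count_1l.txt is assumed to hold single-letter counts of English text,
+# one tab-separated "letter<TAB>count" pair per line (as parsed below).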
+english_counts = collections.defaultdict(int)
+with open('count_1l.txt', 'r') as f:
+    for line in f:
+        (letter, count) = line.split("\t")
+        english_counts[letter] = int(count)
+normalised_english_counts = norms.normalise(english_counts)
def sanitise(text):
        counts[c] += 1
    return counts
-
-def normalise_frequencies(frequencies):
- """Scale a set of letter frequenies so they add to 1
-
- >>> sorted(normalise_frequencies(letter_frequencies('abcdefabc')).items())
- [('a', 0.2222222222222222), ('b', 0.2222222222222222), ('c', 0.2222222222222222), ('d', 0.1111111111111111), ('e', 0.1111111111111111), ('f', 0.1111111111111111)]
- >>> sorted(normalise_frequencies(letter_frequencies('the quick brown fox jumped over the lazy dog')).items())
- [(' ', 0.18181818181818182), ('a', 0.022727272727272728), ('b', 0.022727272727272728), ('c', 0.022727272727272728), ('d', 0.045454545454545456), ('e', 0.09090909090909091), ('f', 0.022727272727272728), ('g', 0.022727272727272728), ('h', 0.045454545454545456), ('i', 0.022727272727272728), ('j', 0.022727272727272728), ('k', 0.022727272727272728), ('l', 0.022727272727272728), ('m', 0.022727272727272728), ('n', 0.022727272727272728), ('o', 0.09090909090909091), ('p', 0.022727272727272728), ('q', 0.022727272727272728), ('r', 0.045454545454545456), ('t', 0.045454545454545456), ('u', 0.045454545454545456), ('v', 0.022727272727272728), ('w', 0.022727272727272728), ('x', 0.022727272727272728), ('y', 0.022727272727272728), ('z', 0.022727272727272728)]
- >>> sorted(normalise_frequencies(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
- [(' ', 0.1568627450980392), ('!', 0.0196078431372549), ('(', 0.0196078431372549), (')', 0.0196078431372549), ('.', 0.058823529411764705), ('9', 0.0196078431372549), ('B', 0.0196078431372549), ('D', 0.0196078431372549), ('G', 0.0196078431372549), ('N', 0.0196078431372549), ('O', 0.0392156862745098), ('Q', 0.0196078431372549), ('R', 0.0196078431372549), ('T', 0.0196078431372549), ('W', 0.0196078431372549), ('a', 0.0196078431372549), ('c', 0.0196078431372549), ('d', 0.0196078431372549), ('e', 0.0784313725490196), ('f', 0.0196078431372549), ('h', 0.0392156862745098), ('i', 0.0196078431372549), ('j', 0.0196078431372549), ('k', 0.0196078431372549), ('l', 0.0196078431372549), ('m', 0.0196078431372549), ('o', 0.0392156862745098), ('p', 0.0196078431372549), ('r', 0.0196078431372549), ('t', 0.0196078431372549), ('u', 0.0392156862745098), ('v', 0.0196078431372549), ('x', 0.0196078431372549), ('y', 0.0196078431372549), ('z', 0.0196078431372549)]
- >>> sorted(normalise_frequencies(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG'))).items())
- [('a', 0.027777777777777776), ('b', 0.027777777777777776), ('c', 0.027777777777777776), ('d', 0.05555555555555555), ('e', 0.1111111111111111), ('f', 0.027777777777777776), ('g', 0.027777777777777776), ('h', 0.05555555555555555), ('i', 0.027777777777777776), ('j', 0.027777777777777776), ('k', 0.027777777777777776), ('l', 0.027777777777777776), ('m', 0.027777777777777776), ('n', 0.027777777777777776), ('o', 0.1111111111111111), ('p', 0.027777777777777776), ('q', 0.027777777777777776), ('r', 0.05555555555555555), ('t', 0.05555555555555555), ('u', 0.05555555555555555), ('v', 0.027777777777777776), ('w', 0.027777777777777776), ('x', 0.027777777777777776), ('y', 0.027777777777777776), ('z', 0.027777777777777776)]
- """
- total = sum(frequencies.values())
- return dict((k, v / total) for (k, v) in frequencies.items())
-
-def l2_norm(frequencies1, frequencies2):
- """Finds the distances between two frequency profiles, expressed as dictionaries.
- Assumes every key in frequencies1 is also in frequencies2
-
- >>> l2_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
- 0.0
- >>> l2_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
- 0.0
- >>> l2_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
- 0.816496580927726
- >>> l2_norm({'a':0, 'b':1}, {'a':1, 'b':1})
- 0.7071067811865476
- """
- f1n = normalise_frequencies(frequencies1)
- f2n = normalise_frequencies(frequencies2)
- total = 0
- for k in f1n.keys():
- total += (f1n[k] - f2n[k]) ** 2
- return total ** 0.5
-euclidean_distance = l2_norm
-
-def l1_norm(frequencies1, frequencies2):
- """Finds the distances between two frequency profiles, expressed as dictionaries.
- Assumes every key in frequencies1 is also in frequencies2
-
- >>> l1_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
- 0.0
- >>> l1_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
- 0.0
- >>> l1_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
- 1.3333333333333333
- >>> l1_norm({'a':0, 'b':1}, {'a':1, 'b':1})
- 1.0
- """
- f1n = normalise_frequencies(frequencies1)
- f2n = normalise_frequencies(frequencies2)
- total = 0
- for k in f1n.keys():
- total += abs(f1n[k] - f2n[k])
- return total
-
-def l3_norm(frequencies1, frequencies2):
- """Finds the distances between two frequency profiles, expressed as dictionaries.
- Assumes every key in frequencies1 is also in frequencies2
-
- >>> l3_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
- 0.0
- >>> l3_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
- 0.0
- >>> l3_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
- 0.7181448966772946
- >>> l3_norm({'a':0, 'b':1}, {'a':1, 'b':1})
- 0.6299605249474366
- """
- f1n = normalise_frequencies(frequencies1)
- f2n = normalise_frequencies(frequencies2)
- total = 0
- for k in f1n.keys():
- total += abs(f1n[k] - f2n[k]) ** 3
- return total ** (1/3)
-
-def cosine_distance(frequencies1, frequencies2):
- """Finds the distances between two frequency profiles, expressed as dictionaries.
- Assumes every key in frequencies1 is also in frequencies2
-
- >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
- -2.220446049250313e-16
- >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
- -2.220446049250313e-16
- >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
- 0.42264973081037416
- >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
- 0.29289321881345254
- """
- numerator = 0
- length1 = 0
- length2 = 0
- for k in frequencies1.keys():
- numerator += frequencies1[k] * frequencies2[k]
- length1 += frequencies1[k]**2
- for k in frequencies2.keys():
- length2 += frequencies2[k]
- return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
-
-
-
-
def caesar_encipher_letter(letter, shift):
"""Encipher a letter, given a shift amount
"""
return caesar_encipher(message, -shift)
-def caesar_break(message, metric=euclidean_distance):
+def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
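+    # target_frequencies is the reference letter distribution to fit against;
+    # message_frequency_scaling should match the scaling used to build it
+    # (both default to the norms.normalise-based values loaded above).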
    sanitised_message = sanitise(message)
    best_shift = 0
    best_fit = float("inf")
    for shift in range(1, 25):
        plaintext = caesar_decipher(sanitised_message, shift)
-        frequencies = letter_frequencies(plaintext)
-        fit = metric(english_counts, frequencies)
+        frequencies = message_frequency_scaling(letter_frequencies(plaintext))
+        fit = metric(target_frequencies, frequencies)
        if fit < best_fit:
            best_fit = fit
            best_shift = shift
    return best_shift, best_fit
-
-
-
-english_counts = collections.defaultdict(int)
-with open('count_1l.txt', 'r') as f:
- for line in f:
- (letter, count) = line.split("\t")
- english_counts[letter] = int(count)
-
-
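+
+# Example use (a sketch; `ciphertext` stands for any Caesar-enciphered string):
+#   shift, fit = caesar_break(ciphertext)
+#   plaintext = caesar_decipher(sanitise(ciphertext), shift)
+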
if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- /dev/null
+++ b/norms.py
+import collections
+
+def normalise(frequencies):
+    """Scale a set of frequencies so they have a unit Euclidean length
+
+    >>> sorted(normalise({1: 1, 2: 0}).items())
+    [(1, 1.0), (2, 0.0)]
+    >>> sorted(normalise({1: 1, 2: 1}).items())
+    [(1, 0.7071067811865475), (2, 0.7071067811865475)]
+    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
+    [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
+    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
+    [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
+    """
+    length = sum([f ** 2 for f in frequencies.values()]) ** 0.5
+    return collections.defaultdict(int, ((k, v / length) for (k, v) in frequencies.items()))
+
+def scale(frequencies):
+    """Scale a set of frequencies so the largest is 1
+
+    >>> sorted(scale({1: 1, 2: 0}).items())
+    [(1, 1.0), (2, 0.0)]
+    >>> sorted(scale({1: 1, 2: 1}).items())
+    [(1, 1.0), (2, 1.0)]
+    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
+    [(1, 1.0), (2, 1.0), (3, 1.0)]
+    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
+    [(1, 0.5), (2, 1.0), (3, 0.5)]
+    """
+    largest = max(frequencies.values())
+    return collections.defaultdict(int, ((k, v / largest) for (k, v) in frequencies.items()))
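+# Both scalings return defaultdicts, presumably so that a key missing from one
+# profile simply reads as 0 when the distance functions below index it.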
+
+
+def l2(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0.0
+    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.7320508075688772
+    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0.0
+    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    1.7320508075688772
+    >>> l2(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
+    0.9194016867619662
+    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
+    1.0
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += (frequencies1[k] - frequencies2[k]) ** 2
+    return total ** 0.5
+euclidean_distance = l2
+
+def l1(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0
+    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    3
+    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0.0
+    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    3
+    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
+    1
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += abs(frequencies1[k] - frequencies2[k])
+    return total
+
+def l3(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    0.0
+    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.4422495703074083
+    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    1.4422495703074083
+    >>> l3(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
+    0.7721675487598008
+    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
+    1.0
+    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1}))
+    0.7234757712960591
+    """
+    total = 0
+    for k in frequencies1.keys():
+        total += abs(frequencies1[k] - frequencies2[k]) ** 3
+    return total ** (1/3)
+
+def cosine_distance(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as dictionaries.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
+    -2.220446049250313e-16
+    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    -2.220446049250313e-16
+    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
+    0.42264973081037416
+    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
+    0.29289321881345254
+    """
+    numerator = 0
+    length1 = 0
+    length2 = 0
+    for k in frequencies1.keys():
+        numerator += frequencies1[k] * frequencies2[k]
+        length1 += frequencies1[k]**2
+    for k in frequencies2.keys():
+        length2 += frequencies2[k]**2
+    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
+
+
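+# Typical use (mirroring caesar_break in the cipher module): build a reference
+# profile with normalise(english_counts), scale the message's letter counts the
+# same way, then compare the two with euclidean_distance (or l1, l3, cosine_distance).
+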
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()