6 """Remove all non-alphabetic characters and convert the text to lowercase
8 >>> sanitise('The Quick')
10 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
11 'thequickbrownfoxjumpedoverthelazydog'
13 sanitised
= [c
.lower() for c
in text
if c
in string
.ascii_letters
]
14 return ''.join(sanitised
)
def letter_frequencies(text):
    """Count the number of occurrences of each character in text.

    Every character is counted, including spaces and punctuation; no case
    folding is performed. Returns a defaultdict(int) mapping character -> count.

    >>> sorted(letter_frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    """
    counts = collections.defaultdict(int)
    for character in text:
        counts[character] += 1
    return counts
def normalise_frequencies(frequencies):
    """Scale a set of letter frequencies so they add to 1.

    Each count is divided by the sum of all counts, giving a probability
    distribution over the same keys.

    >>> normalise_frequencies({'a': 1, 'b': 3})
    {'a': 0.25, 'b': 0.75}
    >>> sorted(normalise_frequencies({'a': 2, 'b': 1, 'c': 1}).items())
    [('a', 0.5), ('b', 0.25), ('c', 0.25)]
    """
    scale = sum(frequencies.values())
    return {letter: count / scale for letter, count in frequencies.items()}
def l2_norm(frequencies1, frequencies2):
    """Find the L2 (Euclidean) distance between two frequency profiles,
    expressed as dictionaries. Both profiles are normalised to sum to 1
    before comparison, so profiles that differ only by scale compare equal.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l2_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2_norm({'a':0, 'b':1}, {'a':1, 'b':1})
    0.7071067811865476
    """
    f1n = normalise_frequencies(frequencies1)
    f2n = normalise_frequencies(frequencies2)
    total = 0
    for k in f1n:
        total += (f1n[k] - f2n[k]) ** 2
    # Square root of the sum of squared differences (Euclidean length).
    return total ** 0.5

# The L2 norm of the difference is exactly the Euclidean distance.
euclidean_distance = l2_norm
def l1_norm(frequencies1, frequencies2):
    """Find the L1 (Manhattan) distance between two frequency profiles,
    expressed as dictionaries: the sum of absolute differences of the
    normalised frequencies. Both profiles are normalised to sum to 1
    before comparison.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l1_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l1_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l1_norm({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    f1n = normalise_frequencies(frequencies1)
    f2n = normalise_frequencies(frequencies2)
    total = 0
    for k in f1n:
        total += abs(f1n[k] - f2n[k])
    return total
def l3_norm(frequencies1, frequencies2):
    """Find the L3 distance between two frequency profiles, expressed as
    dictionaries: the cube root of the sum of cubed absolute differences
    of the normalised frequencies. Both profiles are normalised to sum
    to 1 before comparison.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l3_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    0.0
    """
    f1n = normalise_frequencies(frequencies1)
    f2n = normalise_frequencies(frequencies2)
    total = 0
    for k in f1n:
        total += abs(f1n[k] - f2n[k]) ** 3
    return total ** (1/3)
def cosine_distance(frequencies1, frequencies2):
    """Find the cosine distance (1 - cosine similarity) between two
    frequency profiles, expressed as dictionaries. The profiles are NOT
    normalised first: cosine similarity is already scale-invariant.
    Assumes every key in frequencies1 is also in frequencies2.

    A result of (numerically) zero means the profiles point in the same
    direction; floating-point rounding can make it very slightly negative.

    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    """
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1.keys():
        numerator += frequencies1[k] * frequencies2[k]
        length1 += frequencies1[k] ** 2
    for k in frequencies2.keys():
        # Fix: the magnitude of vector 2 must accumulate squared components,
        # matching length1 — cosine similarity divides the dot product by the
        # product of the Euclidean norms.
        length2 += frequencies2[k] ** 2
    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
def caesar_encipher_letter(letter, shift):
    """Encipher a letter, given a shift amount.

    The shift wraps around the alphabet (mod 26) and preserves case.
    Non-alphabetic characters are returned unchanged.

    >>> caesar_encipher_letter('a', 1)
    'b'
    >>> caesar_encipher_letter('y', 2)
    'a'
    >>> caesar_encipher_letter('z', 2)
    'b'
    >>> caesar_encipher_letter('a', -1)
    'z'
    """
    if letter in string.ascii_letters:
        # Shift within the correct case's 26-letter alphabet.
        if letter in string.ascii_uppercase:
            alphabet_start = ord('A')
        else:
            alphabet_start = ord('a')
        return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
    else:
        # Leave spaces, digits and punctuation untouched.
        return letter
def caesar_decipher_letter(letter, shift):
    """Decipher a letter, given a shift amount.

    Deciphering is simply enciphering with the shift reversed.

    >>> caesar_decipher_letter('b', 1)
    'a'
    """
    reverse_shift = -shift
    return caesar_encipher_letter(letter, reverse_shift)
def caesar_encipher(message, shift):
    """Encipher a message with the Caesar cipher of given shift.

    Each character is shifted independently; non-letters pass through
    unchanged.

    >>> caesar_encipher('abc', 1)
    'bcd'
    """
    return ''.join(caesar_encipher_letter(character, shift)
                   for character in message)
def caesar_decipher(message, shift):
    """Decipher a message enciphered with the Caesar cipher of given shift.

    >>> caesar_decipher('bcd', 1)
    'abc'
    """
    reverse_shift = -shift
    return caesar_encipher(message, reverse_shift)
def caesar_break(message, metric=euclidean_distance):
    """Break a Caesar cipher by brute force.

    Tries every shift, deciphering the sanitised message and comparing the
    resulting letter frequencies against typical English counts using
    `metric` (smaller is better).

    Returns a (best_shift, best_fit) tuple: the winning shift and its
    metric value.
    """
    sanitised_message = sanitise(message)
    best_shift = 0
    best_fit = float("inf")
    # Try every possible shift. Note: range(26) covers the identity shift
    # and shift 25, which the earlier range(1, 25) skipped.
    for shift in range(26):
        plaintext = caesar_decipher(sanitised_message, shift)
        frequencies = letter_frequencies(plaintext)
        fit = metric(english_counts, frequencies)
        if fit < best_fit:
            best_fit = fit
            best_shift = shift
    return best_shift, best_fit
# Letter counts for typical English, loaded from a tab-separated file of
# "letter<TAB>count" lines. Used as the reference profile by caesar_break.
english_counts = collections.defaultdict(int)
with open('count_1l.txt', 'r') as f:
    for line in f:
        (letter, count) = line.split("\t")
        english_counts[letter] = int(count)
226 if __name__
== "__main__":