5 from itertools
import zip_longest
, cycle
, permutations
6 from segment
import segment
, Pwords
7 from multiprocessing
import Pool
10 import matplotlib
.pyplot
as plt
18 # c5a = open('2012/5a.ciphertext', 'r').read()
19 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
20 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
# Module-level dictionary setup: load candidate keywords and index them
# by their column-transposition pattern so equivalent words share one entry.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]

transpositions = collections.defaultdict(list)
# NOTE(review): the loop header below was reconstructed — the original
# source line was not visible in this view; confirm it iterates `keywords`.
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
def frequencies(text):
    """Count the number of occurrences of each character in text

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
            'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
            '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
            'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter takes the iterable directly (no need for a per-character
    # generator) and returns 0 for absent keys, which the ['x'] doctest
    # above relies on.
    return collections.Counter(text)

# Historical alias kept for callers that use the longer name.
letter_frequencies = frequencies
def bigram_likelihood(bigram, bf, lf):
    """Return the ratio of a bigram's observed frequency to the product of
    its two letters' individual frequencies (an independence baseline).

    bigram: two-character string; bf: bigram frequency mapping;
    lf: single-letter frequency mapping.
    """
    independent = lf[bigram[0]] * lf[bigram[1]]
    return bf[bigram] / independent
def caesar_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks a Caesar cipher using frequency analysis

    Returns (best_shift, best_fit).

    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
            'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
    (4, 0.080345432737...)
    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
            'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
    (19, 0.11189290326...)
    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
            'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
    (13, 0.08293968842...)
    """
    sanitised_message = sanitise(message)
    # NOTE(review): the best-shift bookkeeping below was reconstructed —
    # those source lines were not visible in this view.
    best_shift = 0
    best_fit = float("inf")
    for shift in range(26):
        plaintext = caesar_decipher(sanitised_message, shift)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
                     'and decrypt starting: {2}'.format(shift, fit,
                                                        plaintext[:50]))
        if fit < best_fit:
            best_fit = fit
            best_shift = shift
    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_shift, best_fit,
                    caesar_decipher(sanitised_message, best_shift)[:50]))
    return best_shift, best_fit
def affine_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks an affine cipher using frequency analysis

    Returns ((multiplier, adder, one_based), best_fit).

    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
            'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
            'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
            'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
            'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai ' \
            'kxd clm ckuxj.') # doctest: +ELLIPSIS
    ((15, 22, True), 0.0598745365924...)
    """
    sanitised_message = sanitise(message)
    # NOTE(review): the best-key initialisation and update lines were
    # reconstructed — those source lines were not visible in this view.
    best_multiplier = 1
    best_adder = 0
    best_one_based = True
    best_fit = float("inf")
    # Only odd multipliers are coprime with 26 and therefore invertible.
    for one_based in [True, False]:
        for multiplier in range(1, 26, 2):
            for adder in range(26):
                plaintext = affine_decipher(sanitised_message,
                                            multiplier, adder, one_based)
                counts = message_frequency_scaling(letter_frequencies(plaintext))
                fit = metric(target_counts, counts)
                logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
                             'gives fit of {3} and decrypt starting: {4}'.
                             format(multiplier, adder, one_based, fit,
                                    sanitise(plaintext)[:50]))
                if fit < best_fit:
                    best_fit = fit
                    best_multiplier = multiplier
                    best_adder = adder
                    best_one_based = one_based
    logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
                'and decrypt starting: {4}'.format(
                    best_multiplier, best_adder, best_one_based, best_fit,
                    affine_decipher(sanitised_message, best_multiplier,
                                    best_adder, best_one_based)[:50]))
    return (best_multiplier, best_adder, best_one_based), best_fit
def keyword_break(message, wordlist=keywords,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis

    Returns ((keyword, wrap_alphabet), best_fit).

    >>> keyword_break(keyword_encipher('this is a test message for the ' \
                'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.1066453448861...)
    """
    # NOTE(review): the `wordlist=keywords` default and the best-fit update
    # lines were reconstructed — those source lines were not visible here.
    best_keyword = ''
    best_wrap_alphabet = True
    best_fit = float("inf")
    for wrap_alphabet in range(3):
        for keyword in wordlist:
            plaintext = keyword_decipher(message, keyword, wrap_alphabet)
            counts = message_frequency_scaling(letter_frequencies(plaintext))
            fit = metric(target_counts, counts)
            logger.debug('Keyword break attempt using key {0} (wrap={1}) '
                         'gives fit of {2} and decrypt starting: {3}'.format(
                             keyword, wrap_alphabet, fit,
                             sanitise(plaintext)[:50]))
            if fit < best_fit:
                best_fit = fit
                best_keyword = keyword
                best_wrap_alphabet = wrap_alphabet
    logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
                '{2} and decrypt starting: {3}'.format(best_keyword,
                    best_wrap_alphabet, best_fit, sanitise(
                        keyword_decipher(message, best_keyword,
                                         best_wrap_alphabet))[:50]))
    return (best_keyword, best_wrap_alphabet), best_fit
def keyword_break_mp(message, wordlist=keywords,
                     metric=norms.euclidean_distance,
                     target_counts=normalised_english_counts,
                     message_frequency_scaling=norms.normalise,
                     chunksize=500):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis, fanning the work out over a process pool.

    Returns ((keyword, wrap_alphabet), best_fit).

    >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
                'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.106645344886...)
    """
    # NOTE(review): the `wordlist`/`chunksize` defaults and the Pool context
    # manager were reconstructed — those source lines were not visible here.
    with Pool() as pool:
        helper_args = [(message, word, wrap, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist
                       for wrap in range(3)]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
        # Each result is ((keyword, wrap), fit); smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts,
                         message_frequency_scaling):
    """Score a single (keyword, wrap_alphabet) candidate for keyword_break_mp.

    Must stay at module top level so Pool.starmap can pickle it.
    Returns ((keyword, wrap_alphabet), fit).
    """
    plaintext = keyword_decipher(message, keyword, wrap_alphabet)
    scaled = message_frequency_scaling(letter_frequencies(plaintext))
    fit = metric(target_counts, scaled)
    logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
                 '{2} and decrypt starting: {3}'.format(keyword,
                     wrap_alphabet, fit, sanitise(plaintext)[:50]))
    return (keyword, wrap_alphabet), fit
def scytale_break(message,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_bigram_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a Scytale cipher

    Returns (best_key, best_fit). Only key lengths that divide the message
    length evenly are candidates.

    >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
            'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
            'aek') # doctest: +ELLIPSIS
    (6, 0.092599933059...)
    """
    # NOTE(review): the best-key bookkeeping lines were reconstructed —
    # those source lines were not visible in this view.
    best_key = 0
    best_fit = float("inf")
    # Infer the n-gram size from the keys of the target distribution.
    ngram_length = len(next(iter(target_counts.keys())))
    for key in range(1, 20):
        if len(message) % key == 0:
            plaintext = scytale_decipher(message, key)
            counts = message_frequency_scaling(frequencies(
                ngrams(sanitise(plaintext), ngram_length)))
            fit = metric(target_counts, counts)
            logger.debug('Scytale break attempt using key {0} gives fit of '
                         '{1} and decrypt starting: {2}'.format(key,
                             fit, sanitise(plaintext)[:50]))
            if fit < best_fit:
                best_fit = fit
                best_key = key
    logger.info('Scytale break best fit with key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_key, best_fit,
                    sanitise(scytale_decipher(message, best_key))[:50]))
    return best_key, best_fit
def column_transposition_break_mp(message,
                                  translist=transpositions,
                                  metric=norms.euclidean_distance,
                                  target_counts=normalised_english_bigram_counts,
                                  message_frequency_scaling=norms.normalise,
                                  chunksize=500):
    """Breaks a column transposition cipher using a dictionary and
    n-gram frequency analysis

    Returns ((transposition, columnwise), best_fit).

    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
            "It is a truth universally acknowledged, that a single man in \
            possession of a good fortune, must be in want of a wife. However \
            little known the feelings or views of such a man may be on his \
            first entering a neighbourhood, this truth is so well fixed in the \
            minds of the surrounding families, that he is considered the \
            rightful property of some one or other of their daughters."), \
            (2, 0, 5, 3, 1, 4, 6)), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
    (((2, 0, 5, 3, 1, 4, 6), False), 0.0628106372...)
    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
            "It is a truth universally acknowledged, that a single man in \
            possession of a good fortune, must be in want of a wife. However \
            little known the feelings or views of such a man may be on his \
            first entering a neighbourhood, this truth is so well fixed in the \
            minds of the surrounding families, that he is considered the \
            rightful property of some one or other of their daughters."), \
            (2, 0, 5, 3, 1, 4, 6)), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
    (((2, 0, 5, 3, 1, 4, 6), False), 0.0592259560...)
    """
    # NOTE(review): the `chunksize` default, the Pool context manager and the
    # doctest key tuples were reconstructed — those lines were not visible.
    ngram_length = len(next(iter(target_counts.keys())))
    with Pool() as pool:
        helper_args = [(message, trans, columnwise, metric, target_counts,
                        ngram_length, message_frequency_scaling)
                       for trans in translist.keys()
                       for columnwise in [True, False]]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(column_transposition_break_worker,
                              helper_args, chunksize)
        return min(breaks, key=lambda k: k[1])

# Parallel version is the default implementation.
column_transposition_break = column_transposition_break_mp
def column_transposition_break_worker(message, transposition, columnwise,
                                      metric, target_counts,
                                      ngram_length, message_frequency_scaling):
    """Score a single (transposition, columnwise) candidate for
    column_transposition_break_mp. Must stay at module top level so
    Pool.starmap can pickle it. Returns ((transposition, columnwise), fit).
    """
    plaintext = column_transposition_decipher(message, transposition,
                                              columnwise=columnwise)
    scaled = message_frequency_scaling(frequencies(
        ngrams(sanitise(plaintext), ngram_length)))
    fit = metric(target_counts, scaled)
    # NOTE(review): the format() argument line was reconstructed — the
    # original source line was not visible in this view.
    logger.debug('Column transposition break attempt using key {0} '
                 'gives fit of {1} and decrypt starting: {2}'.format(
                     transposition, fit,
                     sanitise(plaintext)[:50]))
    return (transposition, columnwise), fit
def transposition_break_exhaustive(message):
    """Brute-force a column transposition cipher: try every key length up to
    20 that divides the message length, every column permutation, and both
    column orientations, scoring each decrypt by summed log bigram
    likelihood. Returns ((transposition, columnwise), best_score).
    Warning: factorial cost in the key length.
    """
    # NOTE(review): the score-update lines were reconstructed — those
    # source lines were not visible in this view.
    best_transposition = ''
    best_columnwise = True
    best_pw = -float('inf')
    for keylength in range(1, 21):
        if len(message) % keylength == 0:
            for transposition in permutations(range(keylength)):
                for columnwise in [True, False]:
                    plaintext = column_transposition_decipher(
                        message, transposition, columnwise=columnwise)
                    # pw = Pwords(segment(plaintext))
                    pw = sum([log10(bigram_likelihood(b,
                                        normalised_english_bigram_counts,
                                        normalised_english_counts))
                              for b in ngrams(plaintext, 2)])
                    logger.debug('Column transposition break attempt using key {0} {1} '
                                 'gives fit of {2} and decrypt starting: {3}'.format(
                                     transposition, columnwise, pw,
                                     sanitise(plaintext)[:50]))
                    if pw > best_pw:
                        best_pw = pw
                        best_transposition = transposition
                        best_columnwise = columnwise
    return (best_transposition, best_columnwise), best_pw
def vigenere_keyword_break(message, wordlist=keywords,
                           metric=norms.euclidean_distance,
                           target_counts=normalised_english_counts,
                           message_frequency_scaling=norms.normalise):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis

    Returns (keyword, best_fit).

    >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
                'message for the vigenere decipherment'), 'cat'), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.15965224935...)
    """
    # NOTE(review): the `wordlist=keywords` default and the best-fit update
    # lines were reconstructed — those source lines were not visible here.
    best_keyword = ''
    best_fit = float("inf")
    for keyword in wordlist:
        plaintext = vigenere_decipher(message, keyword)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Vigenere break attempt using key {0} '
                     'gives fit of {1} and decrypt starting: {2}'.format(
                         keyword, fit,
                         sanitise(plaintext)[:50]))
        if fit < best_fit:
            best_fit = fit
            best_keyword = keyword
    logger.info('Vigenere break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_keyword,
                    best_fit, sanitise(
                        vigenere_decipher(message, best_keyword))[:50]))
    return best_keyword, best_fit
def vigenere_keyword_break_mp(message, wordlist=keywords,
                              metric=norms.euclidean_distance,
                              target_counts=normalised_english_counts,
                              message_frequency_scaling=norms.normalise,
                              chunksize=500):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis, fanning the work out over a process pool.

    Returns (keyword, best_fit).

    >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
                'message for the vigenere decipherment'), 'cat'), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.159652249358...)
    """
    # NOTE(review): the `wordlist`/`chunksize` defaults and the Pool context
    # manager were reconstructed — those source lines were not visible here.
    with Pool() as pool:
        helper_args = [(message, word, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(vigenere_keyword_break_worker, helper_args,
                              chunksize)
        # Each result is (keyword, fit); smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def vigenere_keyword_break_worker(message, keyword, metric, target_counts,
                                  message_frequency_scaling):
    """Score a single keyword candidate for vigenere_keyword_break_mp.

    Must stay at module top level so Pool.starmap can pickle it.
    Returns (keyword, fit).
    """
    plaintext = vigenere_decipher(message, keyword)
    scaled = message_frequency_scaling(letter_frequencies(plaintext))
    fit = metric(target_counts, scaled)
    logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
                 '{1} and decrypt starting: {2}'.format(keyword,
                     fit, sanitise(plaintext)[:50]))
    # NOTE(review): the return statement was reconstructed (original line not
    # visible); the mp caller's min(breaks, key=k[1]) requires this shape.
    return keyword, fit
def vigenere_frequency_break(message,
                             metric=norms.euclidean_distance,
                             target_counts=normalised_english_counts,
                             message_frequency_scaling=norms.normalise):
    """Breaks a Vigenere cipher with frequency analysis

    For each trial key length, breaks each column of the message as an
    independent Caesar cipher, then scores the whole decrypt.
    Returns (key, best_fit).

    >>> vigenere_frequency_break(vigenere_encipher(sanitise("It is time to " \
            "run. She is ready and so am I. I stole Daniel's pocketbook this " \
            "afternoon when he left his jacket hanging on the easel in the " \
            "attic."), 'florence')) # doctest: +ELLIPSIS
    ('florence', 0.077657073...)
    """
    # NOTE(review): the best-key bookkeeping lines were reconstructed —
    # those source lines were not visible in this view.
    best_fit = float("inf")
    best_key = ''
    sanitised_message = sanitise(message)
    for trial_length in range(1, 20):
        splits = every_nth(sanitised_message, trial_length)
        key = ''.join([chr(caesar_break(s, target_counts=target_counts)[0]
                           + ord('a'))
                       for s in splits])
        plaintext = vigenere_decipher(sanitised_message, key)
        counts = message_frequency_scaling(frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Vigenere key length of {0} ({1}) gives fit of {2}'.
                     format(trial_length, key, fit))
        if fit < best_fit:
            best_fit = fit
            best_key = key
    logger.info('Vigenere break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_key,
                    best_fit, sanitise(
                        vigenere_decipher(message, best_key))[:50]))
    return best_key, best_fit
def beaufort_frequency_break(message,
                             metric=norms.euclidean_distance,
                             target_counts=normalised_english_counts,
                             message_frequency_scaling=norms.normalise):
    """Breaks a Beaufort cipher with frequency analysis

    Same column-by-column Caesar attack as vigenere_frequency_break, but
    deciphering with beaufort_decipher. Returns (key, best_fit).

    >>> vigenere_frequency_break(vigenere_encipher(sanitise("It is time to " \
            "run. She is ready and so am I. I stole Daniel's pocketbook this " \
            "afternoon when he left his jacket hanging on the easel in the " \
            "attic."), 'florence')) # doctest: +ELLIPSIS
    ('florence', 0.077657073...)
    """
    # NOTE(review): the doctest above exercises vigenere_frequency_break —
    # it looks copy-pasted; confirm a Beaufort-specific doctest was intended.
    # The best-key bookkeeping lines were reconstructed (not visible here).
    best_fit = float("inf")
    best_key = ''
    sanitised_message = sanitise(message)
    for trial_length in range(1, 20):
        splits = every_nth(sanitised_message, trial_length)
        key = ''.join([chr(caesar_break(s, target_counts=target_counts)[0]
                           + ord('a'))
                       for s in splits])
        plaintext = beaufort_decipher(sanitised_message, key)
        counts = message_frequency_scaling(frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Beaufort key length of {0} ({1}) gives fit of {2}'.
                     format(trial_length, key, fit))
        if fit < best_fit:
            best_fit = fit
            best_key = key
    logger.info('Beaufort break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_key,
                    best_fit, sanitise(
                        beaufort_decipher(message, best_key))[:50]))
    return best_key, best_fit
def plot_frequency_histogram(freqs, sort_key=None):
    """Draw a bar chart of the counts in freqs (a mapping from symbol to
    count), with one bar per symbol, ordered by sort_key."""
    ordered_symbols = sorted(freqs.keys(), key=sort_key)
    x = range(len(freqs.keys()))
    y = [freqs[l] for l in ordered_symbols]
    # NOTE(review): the figure-creation and tick-position lines were
    # reconstructed — those source lines were not visible in this view.
    f = plt.figure()
    ax = f.add_axes([0.1, 0.1, 0.9, 0.9])
    ax.bar(x, y, align='center')
    ax.set_xticks(x)
    ax.set_xticklabels(ordered_symbols)
472 if __name__
== "__main__":