5 from itertools
import zip_longest
, cycle
, permutations
6 from segment
import segment
, Pwords
7 from multiprocessing
import Pool
10 import matplotlib
.pyplot
as plt
13 from language_models
import *
18 # c5a = open('2012/5a.ciphertext', 'r').read()
19 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
20 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
# Group candidate keywords by their transposition pattern so that each
# distinct column ordering is only tried once when breaking a cipher.
# NOTE(review): the loop header was reconstructed — assumes `keywords` is the
# wordlist defined earlier in this file; confirm against the full source.
transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
def frequencies(text):
    """Count the number of occurrences of each character in text

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
            'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
            '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
            'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter consumes the iterable directly, and missing keys read as 0,
    # which the last doctest relies on.
    return collections.Counter(text)

# Alias used by the break functions below.
letter_frequencies = frequencies
def bigram_likelihood(bigram, bf, lf):
    """Return the ratio of a bigram's observed frequency to the frequency
    expected if its two letters occurred independently (product of the
    single-letter frequencies)."""
    observed = bf[bigram]
    expected = lf[bigram[0]] * lf[bigram[1]]
    return observed / expected
def caesar_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks a Caesar cipher using frequency analysis

    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
            'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
    (4, 0.080345432737...)
    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
            'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
    (19, 0.11189290326...)
    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
            'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
    (13, 0.08293968842...)
    """
    sanitised_message = sanitise(message)
    best_shift = 0
    best_fit = float("inf")
    for shift in range(26):
        plaintext = caesar_decipher(sanitised_message, shift)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
                     'and decrypt starting: {2}'.format(shift, fit,
                                                        plaintext[:50]))
        # Keep the shift whose letter distribution is closest to the target.
        if fit < best_fit:
            best_fit = fit
            best_shift = shift
    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_shift, best_fit,
                    caesar_decipher(sanitised_message, best_shift)[:50]))
    return best_shift, best_fit
def affine_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks an affine cipher using frequency analysis

    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
            'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
            'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
            'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
            'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai ' \
            'kxd clm ckuxj.') # doctest: +ELLIPSIS
    ((15, 22, True), 0.0598745365924...)
    """
    sanitised_message = sanitise(message)
    best_multiplier = 0
    best_adder = 0
    best_one_based = True
    best_fit = float("inf")
    # Only odd multipliers are coprime with 26, so only they are valid keys.
    for one_based in [True, False]:
        for multiplier in range(1, 26, 2):
            for adder in range(26):
                plaintext = affine_decipher(sanitised_message,
                                            multiplier, adder, one_based)
                counts = message_frequency_scaling(letter_frequencies(plaintext))
                fit = metric(target_counts, counts)
                logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
                             'gives fit of {3} and decrypt starting: {4}'.
                             format(multiplier, adder, one_based, fit,
                                    plaintext[:50]))
                if fit < best_fit:
                    best_fit = fit
                    best_multiplier = multiplier
                    best_adder = adder
                    best_one_based = one_based
    logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
                'and decrypt starting: {4}'.format(
                    best_multiplier, best_adder, best_one_based, best_fit,
                    affine_decipher(sanitised_message, best_multiplier,
                                    best_adder, best_one_based)[:50]))
    return (best_multiplier, best_adder, best_one_based), best_fit
def keyword_break(message,
                  wordlist=keywords,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis

    >>> keyword_break(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.1066453448861...)
    """
    best_keyword = ''
    best_wrap_alphabet = True
    best_fit = float("inf")
    # Try every keyword with each of the three alphabet-wrapping modes.
    for wrap_alphabet in range(3):
        for keyword in wordlist:
            plaintext = keyword_decipher(message, keyword, wrap_alphabet)
            counts = message_frequency_scaling(letter_frequencies(plaintext))
            fit = metric(target_counts, counts)
            logger.debug('Keyword break attempt using key {0} (wrap={1}) '
                         'gives fit of {2} and decrypt starting: {3}'.format(
                             keyword, wrap_alphabet, fit,
                             sanitise(plaintext)[:50]))
            if fit < best_fit:
                best_fit = fit
                best_keyword = keyword
                best_wrap_alphabet = wrap_alphabet
    logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
                '{2} and decrypt starting: {3}'.format(best_keyword,
                    best_wrap_alphabet, best_fit, sanitise(
                        keyword_decipher(message, best_keyword,
                                         best_wrap_alphabet))[:50]))
    return (best_keyword, best_wrap_alphabet), best_fit
def keyword_break_mp(message,
                     wordlist=keywords,
                     metric=norms.euclidean_distance,
                     target_counts=normalised_english_counts,
                     message_frequency_scaling=norms.normalise,
                     chunksize=500):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis, spreading the candidates across a process pool.

    >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.106645344886...)
    """
    with Pool() as pool:
        helper_args = [(message, word, wrap, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist for wrap in range(3)]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
        # Lower fit is better, so take the candidate with the minimum score.
        return min(breaks, key=lambda k: k[1])
def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts,
                         message_frequency_scaling):
    """Score one (keyword, wrap_alphabet) candidate against the target
    letter-frequency distribution; runs as a Pool.starmap worker."""
    deciphered = keyword_decipher(message, keyword, wrap_alphabet)
    scaled = message_frequency_scaling(letter_frequencies(deciphered))
    fit = metric(target_counts, scaled)
    logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
                 '{2} and decrypt starting: {3}'.format(keyword,
                     wrap_alphabet, fit, sanitise(deciphered)[:50]))
    return (keyword, wrap_alphabet), fit
def scytale_break(message,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_bigram_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a Scytale cipher

    >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
            'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
            'aek') # doctest: +ELLIPSIS
    (6, 0.092599933059...)
    """
    best_key = 0
    best_fit = float("inf")
    # Infer the n-gram size from the keys of the target count table.
    ngram_length = len(next(iter(target_counts.keys())))
    for key in range(1, 20):
        # A scytale key must divide the message length exactly.
        if len(message) % key == 0:
            plaintext = scytale_decipher(message, key)
            counts = message_frequency_scaling(frequencies(
                ngrams(sanitise(plaintext), ngram_length)))
            fit = metric(target_counts, counts)
            logger.debug('Scytale break attempt using key {0} gives fit of '
                         '{1} and decrypt starting: {2}'.format(key,
                             fit, sanitise(plaintext)[:50]))
            if fit < best_fit:
                best_fit = fit
                best_key = key
    logger.info('Scytale break best fit with key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_key, best_fit,
                    sanitise(scytale_decipher(message, best_key))[:50]))
    return best_key, best_fit
def column_transposition_break_mp(message,
                                  translist=transpositions,
                                  metric=norms.euclidean_distance,
                                  target_counts=normalised_english_bigram_counts,
                                  message_frequency_scaling=norms.normalise,
                                  chunksize=500):
    """Breaks a column transposition cipher using a dictionary and
    n-gram frequency analysis

    # >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
    #         "It is a truth universally acknowledged, that a single man in \
    #         possession of a good fortune, must be in want of a wife. However \
    #         little known the feelings or views of such a man may be on his \
    #         first entering a neighbourhood, this truth is so well fixed in the \
    #         minds of the surrounding families, that he is considered the \
    #         rightful property of some one or other of their daughters."), \
    #     translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
    #                (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
    #                (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
    # (((2, 0, 5, 3, 1, 4, 6), False), 0.0628106372...)
    # >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
    #         "It is a truth universally acknowledged, that a single man in \
    #         possession of a good fortune, must be in want of a wife. However \
    #         little known the feelings or views of such a man may be on his \
    #         first entering a neighbourhood, this truth is so well fixed in the \
    #         minds of the surrounding families, that he is considered the \
    #         rightful property of some one or other of their daughters."), \
    #     translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
    #                (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
    #                (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
    #     target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
    # (((2, 0, 5, 3, 1, 4, 6), False), 0.0592259560...)
    """
    # Infer the n-gram size from the keys of the target count table.
    ngram_length = len(next(iter(target_counts.keys())))
    with Pool() as pool:
        helper_args = [(message, trans, columnwise, metric, target_counts, ngram_length,
                        message_frequency_scaling)
                       for trans in translist.keys() for columnwise in [True, False]]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(column_transposition_break_worker, helper_args, chunksize)
        return min(breaks, key=lambda k: k[1])

column_transposition_break = column_transposition_break_mp
def column_transposition_break_worker(message, transposition, columnwise, metric, target_counts,
                                      ngram_length, message_frequency_scaling):
    """Score one (transposition, columnwise) candidate by n-gram frequency
    fit; runs as a Pool.starmap worker for column_transposition_break_mp."""
    plaintext = column_transposition_decipher(message, transposition, columnwise=columnwise)
    counts = message_frequency_scaling(frequencies(
        ngrams(sanitise(plaintext), ngram_length)))
    fit = metric(target_counts, counts)
    logger.debug('Column transposition break attempt using key {0} '
                 'gives fit of {1} and decrypt starting: {2}'.format(
                     transposition, fit,
                     sanitise(plaintext)[:50]))
    return (transposition, columnwise), fit
def transposition_break_exhaustive(message):
    """Breaks a column transposition cipher by trying every permutation of
    every key length (1-20) that divides the message, scoring candidates by
    summed bigram log-likelihood (higher is better)."""
    best_transposition = ''
    best_columnwise = False
    best_pw = -float('inf')
    for keylength in range(1, 21):
        if len(message) % keylength == 0:
            for transposition in permutations(range(keylength)):
                for columnwise in [True, False]:
                    plaintext = column_transposition_decipher(message,
                        transposition, columnwise=columnwise)
                    # pw = Pwords(segment(plaintext))
                    pw = sum([log10(bigram_likelihood(b,
                                                      normalised_english_bigram_counts,
                                                      normalised_english_counts))
                              for b in ngrams(plaintext, 2)])
                    logger.debug('Column transposition break attempt using key {0} {1} '
                                 'gives fit of {2} and decrypt starting: {3}'.format(
                                     transposition, columnwise, pw,
                                     sanitise(plaintext)[:50]))
                    # Higher log-likelihood means more English-like text.
                    if pw > best_pw:
                        best_pw = pw
                        best_transposition = transposition
                        best_columnwise = columnwise
    return (best_transposition, best_columnwise), best_pw
def vigenere_keyword_break(message,
                           wordlist=keywords,
                           metric=norms.euclidean_distance,
                           target_counts=normalised_english_counts,
                           message_frequency_scaling=norms.normalise):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis

    >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
            'message for the vigenere decipherment'), 'cat'), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.15965224935...)
    """
    best_keyword = ''
    best_fit = float("inf")
    for keyword in wordlist:
        plaintext = vigenere_decipher(message, keyword)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Vigenere break attempt using key {0} '
                     'gives fit of {1} and decrypt starting: {2}'.format(
                         keyword, fit,
                         sanitise(plaintext)[:50]))
        if fit < best_fit:
            best_fit = fit
            best_keyword = keyword
    logger.info('Vigenere break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_keyword,
                    best_fit, sanitise(
                        vigenere_decipher(message, best_keyword))[:50]))
    return best_keyword, best_fit
def vigenere_keyword_break_mp(message,
                              wordlist=keywords,
                              metric=norms.euclidean_distance,
                              target_counts=normalised_english_counts,
                              message_frequency_scaling=norms.normalise,
                              chunksize=500):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis, spreading the candidates across a process pool.

    >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
            'message for the vigenere decipherment'), 'cat'), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.159652249358...)
    """
    with Pool() as pool:
        helper_args = [(message, word, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize)
        return min(breaks, key=lambda k: k[1])
def vigenere_keyword_break_worker(message, keyword, metric, target_counts,
                                  message_frequency_scaling):
    """Score a single keyword candidate for breaking a Vigenere cipher;
    runs as a Pool.starmap worker for vigenere_keyword_break_mp."""
    plaintext = vigenere_decipher(message, keyword)
    counts = message_frequency_scaling(letter_frequencies(plaintext))
    fit = metric(target_counts, counts)
    logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
                 '{1} and decrypt starting: {2}'.format(keyword,
                     fit, sanitise(plaintext)[:50]))
    # Without this return the worker would yield None and the mp caller's
    # min() over (keyword, fit) pairs would fail.
    return keyword, fit
def vigenere_frequency_break(message,
                             metric=norms.euclidean_distance,
                             target_counts=normalised_english_counts,
                             message_frequency_scaling=norms.normalise):
    """Breaks a Vigenere cipher with frequency analysis

    >>> vigenere_frequency_break(vigenere_encipher(sanitise("It is time to " \
            "run. She is ready and so am I. I stole Daniel's pocketbook this " \
            "afternoon when he left his jacket hanging on the easel in the " \
            "attic."), 'florence')) # doctest: +ELLIPSIS
    ('florence', 0.077657073...)
    """
    best_key = ''
    best_fit = float("inf")
    sanitised_message = sanitise(message)
    for trial_length in range(1, 20):
        # Each of the trial_length interleaved columns of a Vigenere cipher
        # is a simple Caesar cipher, so break each column independently.
        splits = every_nth(sanitised_message, trial_length)
        key = ''.join([chr(caesar_break(s, target_counts=target_counts)[0] + ord('a'))
                       for s in splits])
        plaintext = vigenere_decipher(sanitised_message, key)
        counts = message_frequency_scaling(frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Vigenere key length of {0} ({1}) gives fit of {2}'.
                     format(trial_length, key, fit))
        if fit < best_fit:
            best_fit = fit
            best_key = key
    logger.info('Vigenere break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_key,
                    best_fit, sanitise(
                        vigenere_decipher(message, best_key))[:50]))
    return best_key, best_fit
def beaufort_frequency_break(message,
                             metric=norms.euclidean_distance,
                             target_counts=normalised_english_counts,
                             message_frequency_scaling=norms.normalise):
    """Breaks a Beaufort cipher with frequency analysis

    >>> vigenere_frequency_break(vigenere_encipher(sanitise("It is time to " \
            "run. She is ready and so am I. I stole Daniel's pocketbook this " \
            "afternoon when he left his jacket hanging on the easel in the " \
            "attic."), 'florence')) # doctest: +ELLIPSIS
    ('florence', 0.077657073...)
    """
    best_key = ''
    best_fit = float("inf")
    sanitised_message = sanitise(message)
    for trial_length in range(1, 20):
        # Each interleaved column of a Beaufort cipher reduces to a Caesar
        # cipher, so break each column independently to recover the key.
        splits = every_nth(sanitised_message, trial_length)
        key = ''.join([chr(caesar_break(s, target_counts=target_counts)[0] + ord('a'))
                       for s in splits])
        plaintext = beaufort_decipher(sanitised_message, key)
        counts = message_frequency_scaling(frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Beaufort key length of {0} ({1}) gives fit of {2}'.
                     format(trial_length, key, fit))
        if fit < best_fit:
            best_fit = fit
            best_key = key
    logger.info('Beaufort break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_key,
                    best_fit, sanitise(
                        beaufort_decipher(message, best_key))[:50]))
    return best_key, best_fit
def plot_frequency_histogram(freqs, sort_key=None):
    """Plot a bar chart of symbol frequencies.

    freqs: mapping of symbol -> count; sort_key orders the x-axis labels
    (passed to sorted() over the keys of freqs).
    """
    x = range(len(freqs.keys()))
    y = [freqs[l] for l in sorted(freqs.keys(), key=sort_key)]
    # NOTE(review): figure creation, tick placement and the final show()
    # were reconstructed (the original `f` was used but never defined in
    # this chunk) — confirm against the full source.
    f = plt.figure()
    ax = f.add_axes([0.1, 0.1, 0.9, 0.9])
    ax.bar(x, y, align='center')
    ax.set_xticks(x)
    ax.set_xticklabels(sorted(freqs.keys(), key=sort_key))
    plt.show()
469 if __name__
== "__main__":