5 from itertools
import zip_longest
, cycle
, permutations
6 from segment
import segment
, Pwords
7 from multiprocessing
import Pool
10 import matplotlib
.pyplot
as plt
17 # c5a = open('2012/5a.ciphertext', 'r').read()
18 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
19 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
22 english_counts
= collections
.defaultdict(int)
23 with
open('count_1l.txt', 'r') as f
:
25 (letter
, count
) = line
.split("\t")
26 english_counts
[letter
] = int(count
)
27 normalised_english_counts
= norms
.normalise(english_counts
)
29 english_bigram_counts
= collections
.defaultdict(int)
30 with
open('count_2l.txt', 'r') as f
:
32 (bigram
, count
) = line
.split("\t")
33 english_bigram_counts
[bigram
] = int(count
)
34 normalised_english_bigram_counts
= norms
.normalise(english_bigram_counts
)
36 english_trigram_counts
= collections
.defaultdict(int)
37 with
open('count_3l.txt', 'r') as f
:
39 (trigram
, count
) = line
.split("\t")
40 english_trigram_counts
[trigram
] = int(count
)
41 normalised_english_trigram_counts
= norms
.normalise(english_trigram_counts
)
44 with
open('words.txt', 'r') as f
:
45 keywords
= [line
.rstrip() for line
in f
]
47 transpositions
= collections
.defaultdict(list)
49 transpositions
[transpositions_of(word
)] += [word
]
51 def frequencies(text
):
52 """Count the number of occurrences of each character in text
54 >>> sorted(frequencies('abcdefabc').items())
55 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
56 >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
57 'dog').items()) # doctest: +NORMALIZE_WHITESPACE
58 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
59 ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
60 ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
61 ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
62 >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
63 '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
64 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
65 ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
66 ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
67 ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
68 ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
69 >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
70 'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
71 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
72 ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
73 ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
74 ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
75 >>> frequencies('abcdefabcdef')['x']
78 #counts = collections.defaultdict(int)
82 return collections
.Counter(c
for c
in text
)
83 letter_frequencies
= frequencies
86 def bigram_likelihood(bigram
, bf
, lf
):
87 return bf
[bigram
] / (lf
[bigram
[0]] * lf
[bigram
[1]])
90 def caesar_break(message
,
91 metric
=norms
.euclidean_distance
,
92 target_counts
=normalised_english_counts
,
93 message_frequency_scaling
=norms
.normalise
):
94 """Breaks a Caesar cipher using frequency analysis
96 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
97 'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
98 (4, 0.080345432737...)
99 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
100 'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
101 (19, 0.11189290326...)
102 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
103 'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
104 (13, 0.08293968842...)
106 sanitised_message
= sanitise(message
)
108 best_fit
= float("inf")
109 for shift
in range(26):
110 plaintext
= caesar_decipher(sanitised_message
, shift
)
111 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
112 fit
= metric(target_counts
, counts
)
113 logger
.debug('Caesar break attempt using key {0} gives fit of {1} '
114 'and decrypt starting: {2}'.format(shift
, fit
, plaintext
[:50]))
118 logger
.info('Caesar break best fit: key {0} gives fit of {1} and '
119 'decrypt starting: {2}'.format(best_shift
, best_fit
,
120 caesar_decipher(sanitised_message
, best_shift
)[:50]))
121 return best_shift
, best_fit
123 def affine_break(message
,
124 metric
=norms
.euclidean_distance
,
125 target_counts
=normalised_english_counts
,
126 message_frequency_scaling
=norms
.normalise
):
127 """Breaks an affine cipher using frequency analysis
129 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
130 'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
131 'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
132 'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
133 'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
134 'clm ckuxj.') # doctest: +ELLIPSIS
135 ((15, 22, True), 0.0598745365924...)
137 sanitised_message
= sanitise(message
)
140 best_one_based
= True
141 best_fit
= float("inf")
142 for one_based
in [True, False]:
143 for multiplier
in range(1, 26, 2):
144 for adder
in range(26):
145 plaintext
= affine_decipher(sanitised_message
,
146 multiplier
, adder
, one_based
)
147 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
148 fit
= metric(target_counts
, counts
)
149 logger
.debug('Affine break attempt using key {0}x+{1} ({2}) '
150 'gives fit of {3} and decrypt starting: {4}'.
151 format(multiplier
, adder
, one_based
, fit
,
155 best_multiplier
= multiplier
157 best_one_based
= one_based
158 logger
.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
159 'and decrypt starting: {4}'.format(
160 best_multiplier
, best_adder
, best_one_based
, best_fit
,
161 affine_decipher(sanitised_message
, best_multiplier
,
162 best_adder
, best_one_based
)[:50]))
163 return (best_multiplier
, best_adder
, best_one_based
), best_fit
165 def keyword_break(message
,
167 metric
=norms
.euclidean_distance
,
168 target_counts
=normalised_english_counts
,
169 message_frequency_scaling
=norms
.normalise
):
170 """Breaks a keyword substitution cipher using a dictionary and
173 >>> keyword_break(keyword_encipher('this is a test message for the ' \
174 'keyword decipherment', 'elephant', 1), \
175 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
176 (('elephant', 1), 0.1066453448861...)
179 best_wrap_alphabet
= True
180 best_fit
= float("inf")
181 for wrap_alphabet
in range(3):
182 for keyword
in wordlist
:
183 plaintext
= keyword_decipher(message
, keyword
, wrap_alphabet
)
184 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
185 fit
= metric(target_counts
, counts
)
186 logger
.debug('Keyword break attempt using key {0} (wrap={1}) '
187 'gives fit of {2} and decrypt starting: {3}'.format(
188 keyword
, wrap_alphabet
, fit
,
189 sanitise(plaintext
)[:50]))
192 best_keyword
= keyword
193 best_wrap_alphabet
= wrap_alphabet
194 logger
.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
195 '{2} and decrypt starting: {3}'.format(best_keyword
,
196 best_wrap_alphabet
, best_fit
, sanitise(
197 keyword_decipher(message
, best_keyword
,
198 best_wrap_alphabet
))[:50]))
199 return (best_keyword
, best_wrap_alphabet
), best_fit
201 def keyword_break_mp(message
,
203 metric
=norms
.euclidean_distance
,
204 target_counts
=normalised_english_counts
,
205 message_frequency_scaling
=norms
.normalise
,
207 """Breaks a keyword substitution cipher using a dictionary and
210 >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
211 'keyword decipherment', 'elephant', 1), \
212 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
213 (('elephant', 1), 0.106645344886...)
216 helper_args
= [(message
, word
, wrap
, metric
, target_counts
,
217 message_frequency_scaling
)
218 for word
in wordlist
for wrap
in range(3)]
219 # Gotcha: the helper function here needs to be defined at the top level
220 # (limitation of Pool.starmap)
221 breaks
= pool
.starmap(keyword_break_worker
, helper_args
, chunksize
)
222 return min(breaks
, key
=lambda k
: k
[1])
224 def keyword_break_worker(message
, keyword
, wrap_alphabet
, metric
, target_counts
,
225 message_frequency_scaling
):
226 plaintext
= keyword_decipher(message
, keyword
, wrap_alphabet
)
227 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
228 fit
= metric(target_counts
, counts
)
229 logger
.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
230 '{2} and decrypt starting: {3}'.format(keyword
,
231 wrap_alphabet
, fit
, sanitise(plaintext
)[:50]))
232 return (keyword
, wrap_alphabet
), fit
234 def scytale_break(message
,
235 metric
=norms
.euclidean_distance
,
236 target_counts
=normalised_english_bigram_counts
,
237 message_frequency_scaling
=norms
.normalise
):
238 """Breaks a Scytale cipher
240 >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
241 'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
242 'aek') # doctest: +ELLIPSIS
243 (6, 0.092599933059...)
246 best_fit
= float("inf")
247 ngram_length
= len(next(iter(target_counts
.keys())))
248 for key
in range(1, 20):
249 if len(message
) % key
== 0:
250 plaintext
= scytale_decipher(message
, key
)
251 counts
= message_frequency_scaling(frequencies(
252 ngrams(sanitise(plaintext
), ngram_length
)))
253 fit
= metric(target_counts
, counts
)
254 logger
.debug('Scytale break attempt using key {0} gives fit of '
255 '{1} and decrypt starting: {2}'.format(key
,
256 fit
, sanitise(plaintext
)[:50]))
260 logger
.info('Scytale break best fit with key {0} gives fit of {1} and '
261 'decrypt starting: {2}'.format(best_key
, best_fit
,
262 sanitise(scytale_decipher(message
, best_key
))[:50]))
263 return best_key
, best_fit
266 def column_transposition_break_mp(message
,
267 translist
=transpositions
,
268 metric
=norms
.euclidean_distance
,
269 target_counts
=normalised_english_bigram_counts
,
270 message_frequency_scaling
=norms
.normalise
,
272 """Breaks a column transposition cipher using a dictionary and
273 n-gram frequency analysis
275 >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
276 "It is a truth universally acknowledged, that a single man in \
277 possession of a good fortune, must be in want of a wife. However \
278 little known the feelings or views of such a man may be on his \
279 first entering a neighbourhood, this truth is so well fixed in the \
280 minds of the surrounding families, that he is considered the \
281 rightful property of some one or other of their daughters."), \
283 translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
284 (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
285 (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
286 (((2, 0, 5, 3, 1, 4, 6), False), 0.0628106372...)
287 >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
288 "It is a truth universally acknowledged, that a single man in \
289 possession of a good fortune, must be in want of a wife. However \
290 little known the feelings or views of such a man may be on his \
291 first entering a neighbourhood, this truth is so well fixed in the \
292 minds of the surrounding families, that he is considered the \
293 rightful property of some one or other of their daughters."), \
295 translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
296 (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
297 (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
298 target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
299 (((2, 0, 5, 3, 1, 4, 6), False), 0.0592259560...)
301 ngram_length
= len(next(iter(target_counts
.keys())))
303 helper_args
= [(message
, trans
, columnwise
, metric
, target_counts
, ngram_length
,
304 message_frequency_scaling
)
305 for trans
in translist
.keys() for columnwise
in [True, False]]
306 # Gotcha: the helper function here needs to be defined at the top level
307 # (limitation of Pool.starmap)
308 breaks
= pool
.starmap(column_transposition_break_worker
, helper_args
, chunksize
)
309 return min(breaks
, key
=lambda k
: k
[1])
310 column_transposition_break
= column_transposition_break_mp
312 def column_transposition_break_worker(message
, transposition
, columnwise
, metric
, target_counts
,
313 ngram_length
, message_frequency_scaling
):
314 plaintext
= column_transposition_decipher(message
, transposition
, columnwise
=columnwise
)
315 counts
= message_frequency_scaling(frequencies(
316 ngrams(sanitise(plaintext
), ngram_length
)))
317 fit
= metric(target_counts
, counts
)
318 logger
.debug('Column transposition break attempt using key {0} '
319 'gives fit of {1} and decrypt starting: {2}'.format(
321 sanitise(plaintext
)[:50]))
322 return (transposition
, columnwise
), fit
325 def transposition_break_exhaustive(message
):
326 best_transposition
= ''
327 best_pw
= -float('inf')
328 for keylength
in range(1, 21):
329 if len(message
) % keylength
== 0:
330 for transposition
in permutations(range(keylength
)):
331 for columnwise
in [True, False]:
332 plaintext
= column_transposition_decipher(message
,
333 transposition
, columnwise
=columnwise
)
334 # pw = Pwords(segment(plaintext))
335 pw
= sum([log10(bigram_likelihood(b
,
336 normalised_english_bigram_counts
,
337 normalised_english_counts
))
338 for b
in ngrams(plaintext
, 2)])
339 logger
.debug('Column transposition break attempt using key {0} {1} '
340 'gives fit of {2} and decrypt starting: {3}'.format(
341 transposition
, columnwise
, pw
,
342 sanitise(plaintext
)[:50]))
344 best_transposition
= transposition
345 best_columnwise
= columnwise
347 return (best_transposition
, best_columnwise
), best_pw
350 def vigenere_keyword_break(message
,
352 metric
=norms
.euclidean_distance
,
353 target_counts
=normalised_english_counts
,
354 message_frequency_scaling
=norms
.normalise
):
355 """Breaks a vigenere cipher using a dictionary and
358 >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
359 'message for the vigenere decipherment'), 'cat'), \
360 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
361 ('cat', 0.15965224935...)
364 best_fit
= float("inf")
365 for keyword
in wordlist
:
366 plaintext
= vigenere_decipher(message
, keyword
)
367 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
368 fit
= metric(target_counts
, counts
)
369 logger
.debug('Vigenere break attempt using key {0} '
370 'gives fit of {1} and decrypt starting: {2}'.format(
372 sanitise(plaintext
)[:50]))
375 best_keyword
= keyword
376 logger
.info('Vigenere break best fit with key {0} gives fit '
377 'of {1} and decrypt starting: {2}'.format(best_keyword
,
379 vigenere_decipher(message
, best_keyword
))[:50]))
380 return best_keyword
, best_fit
382 def vigenere_keyword_break_mp(message
,
384 metric
=norms
.euclidean_distance
,
385 target_counts
=normalised_english_counts
,
386 message_frequency_scaling
=norms
.normalise
,
388 """Breaks a vigenere cipher using a dictionary and
391 >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
392 'message for the vigenere decipherment'), 'cat'), \
393 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
394 ('cat', 0.159652249358...)
397 helper_args
= [(message
, word
, metric
, target_counts
,
398 message_frequency_scaling
)
399 for word
in wordlist
]
400 # Gotcha: the helper function here needs to be defined at the top level
401 # (limitation of Pool.starmap)
402 breaks
= pool
.starmap(vigenere_keyword_break_worker
, helper_args
, chunksize
)
403 return min(breaks
, key
=lambda k
: k
[1])
405 def vigenere_keyword_break_worker(message
, keyword
, metric
, target_counts
,
406 message_frequency_scaling
):
407 plaintext
= vigenere_decipher(message
, keyword
)
408 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
409 fit
= metric(target_counts
, counts
)
410 logger
.debug('Vigenere keyword break attempt using key {0} gives fit of '
411 '{1} and decrypt starting: {2}'.format(keyword
,
412 fit
, sanitise(plaintext
)[:50]))
417 def vigenere_frequency_break(message
,
418 metric
=norms
.euclidean_distance
,
419 target_counts
=normalised_english_counts
,
420 message_frequency_scaling
=norms
.normalise
):
421 """Breaks a Vigenere cipher with frequency analysis
423 >>> vigenere_frequency_break(vigenere_encipher(sanitise("It is time to " \
424 "run. She is ready and so am I. I stole Daniel's pocketbook this " \
425 "afternoon when he left his jacket hanging on the easel in the " \
426 "attic."), 'florence')) # doctest: +ELLIPSIS
427 ('florence', 0.077657073...)
429 best_fit
= float("inf")
431 sanitised_message
= sanitise(message
)
432 for trial_length
in range(1, 20):
433 splits
= every_nth(sanitised_message
, trial_length
)
434 key
= ''.join([chr(caesar_break(s
, target_counts
=target_counts
)[0] + ord('a')) for s
in splits
])
435 plaintext
= vigenere_decipher(sanitised_message
, key
)
436 counts
= message_frequency_scaling(frequencies(plaintext
))
437 fit
= metric(target_counts
, counts
)
438 logger
.debug('Vigenere key length of {0} ({1}) gives fit of {2}'.
439 format(trial_length
, key
, fit
))
443 logger
.info('Vigenere break best fit with key {0} gives fit '
444 'of {1} and decrypt starting: {2}'.format(best_key
,
446 vigenere_decipher(message
, best_key
))[:50]))
447 return best_key
, best_fit
449 def beaufort_frequency_break(message
,
450 metric
=norms
.euclidean_distance
,
451 target_counts
=normalised_english_counts
,
452 message_frequency_scaling
=norms
.normalise
):
453 """Breaks a Beaufort cipher with frequency analysis
455 >>> vigenere_frequency_break(vigenere_encipher(sanitise("It is time to " \
456 "run. She is ready and so am I. I stole Daniel's pocketbook this " \
457 "afternoon when he left his jacket hanging on the easel in the " \
458 "attic."), 'florence')) # doctest: +ELLIPSIS
459 ('florence', 0.077657073...)
461 best_fit
= float("inf")
463 sanitised_message
= sanitise(message
)
464 for trial_length
in range(1, 20):
465 splits
= every_nth(sanitised_message
, trial_length
)
466 key
= ''.join([chr(caesar_break(s
, target_counts
=target_counts
)[0] + ord('a')) for s
in splits
])
467 plaintext
= beaufort_decipher(sanitised_message
, key
)
468 counts
= message_frequency_scaling(frequencies(plaintext
))
469 fit
= metric(target_counts
, counts
)
470 logger
.debug('Beaufort key length of {0} ({1}) gives fit of {2}'.
471 format(trial_length
, key
, fit
))
475 logger
.info('Beaufort break best fit with key {0} gives fit '
476 'of {1} and decrypt starting: {2}'.format(best_key
,
478 beaufort_decipher(message
, best_key
))[:50]))
479 return best_key
, best_fit
483 def plot_frequency_histogram(freqs
, sort_key
=None):
484 x
= range(len(freqs
.keys()))
485 y
= [freqs
[l
] for l
in sorted(freqs
.keys(), key
=sort_key
)]
487 ax
= f
.add_axes([0.1, 0.1, 0.9, 0.9])
488 ax
.bar(x
, y
, align
='center')
490 ax
.set_xticklabels(sorted(freqs
.keys(), key
=sort_key
))
494 if __name__
== "__main__":