Renamed break.py to cipherbreak.py so it wasn't a reserved word.
[cipher-tools.git] / cipherbreak.py
1 import string
2 import collections
3 import norms
4 import logging
5 from itertools import zip_longest, cycle
6 from segment import segment, Pwords
7 from multiprocessing import Pool
8
9 from cipher import *
10
11 # To time a run:
12 #
13 # import timeit
14 # c5a = open('2012/5a.ciphertext', 'r').read()
15 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
16 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
17
18
19 english_counts = collections.defaultdict(int)
20 with open('count_1l.txt', 'r') as f:
21 for line in f:
22 (letter, count) = line.split("\t")
23 english_counts[letter] = int(count)
24 normalised_english_counts = norms.normalise(english_counts)
25
26 english_bigram_counts = collections.defaultdict(int)
27 with open('count_2l.txt', 'r') as f:
28 for line in f:
29 (bigram, count) = line.split("\t")
30 english_bigram_counts[bigram] = int(count)
31 normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
32
33 english_trigram_counts = collections.defaultdict(int)
34 with open('count_3l.txt', 'r') as f:
35 for line in f:
36 (trigram, count) = line.split("\t")
37 english_trigram_counts[trigram] = int(count)
38 normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
39
40
41 with open('words.txt', 'r') as f:
42 keywords = [line.rstrip() for line in f]
43
44 transpositions = collections.defaultdict(list)
45 for word in keywords:
46 transpositions[transpositions_of(word)] += [word]
47
48 def frequencies(text):
49 """Count the number of occurrences of each character in text
50
51 >>> sorted(frequencies('abcdefabc').items())
52 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
53 >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
54 'dog').items()) # doctest: +NORMALIZE_WHITESPACE
55 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
56 ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
57 ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
58 ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
59 >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
60 '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
61 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
62 ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
63 ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
64 ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
65 ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
66 >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
67 'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
68 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
69 ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
70 ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
71 ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
72 >>> frequencies('abcdefabcdef')['x']
73 0
74 """
75 #counts = collections.defaultdict(int)
76 #for c in text:
77 # counts[c] += 1
78 #return counts
79 return collections.Counter(c for c in text)
80 letter_frequencies = frequencies
81
82
83
84 def caesar_break(message,
85 metric=norms.euclidean_distance,
86 target_counts=normalised_english_counts,
87 message_frequency_scaling=norms.normalise):
88 """Breaks a Caesar cipher using frequency analysis
89
90 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
91 'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
92 (4, 0.080345432737...)
93 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
94 'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
95 (19, 0.11189290326...)
96 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
97 'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
98 (13, 0.08293968842...)
99 """
100 sanitised_message = sanitise(message)
101 best_shift = 0
102 best_fit = float("inf")
103 for shift in range(26):
104 plaintext = caesar_decipher(sanitised_message, shift)
105 counts = message_frequency_scaling(letter_frequencies(plaintext))
106 fit = metric(target_counts, counts)
107 logger.debug('Caesar break attempt using key {0} gives fit of {1} '
108 'and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
109 if fit < best_fit:
110 best_fit = fit
111 best_shift = shift
112 logger.info('Caesar break best fit: key {0} gives fit of {1} and '
113 'decrypt starting: {2}'.format(best_shift, best_fit,
114 caesar_decipher(sanitised_message, best_shift)[:50]))
115 return best_shift, best_fit
116
117 def affine_break(message,
118 metric=norms.euclidean_distance,
119 target_counts=normalised_english_counts,
120 message_frequency_scaling=norms.normalise):
121 """Breaks an affine cipher using frequency analysis
122
123 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
124 'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
125 'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
126 'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
127 'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
128 'clm ckuxj.') # doctest: +ELLIPSIS
129 ((15, 22, True), 0.0598745365924...)
130 """
131 sanitised_message = sanitise(message)
132 best_multiplier = 0
133 best_adder = 0
134 best_one_based = True
135 best_fit = float("inf")
136 for one_based in [True, False]:
137 for multiplier in range(1, 26, 2):
138 for adder in range(26):
139 plaintext = affine_decipher(sanitised_message,
140 multiplier, adder, one_based)
141 counts = message_frequency_scaling(letter_frequencies(plaintext))
142 fit = metric(target_counts, counts)
143 logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
144 'gives fit of {3} and decrypt starting: {4}'.
145 format(multiplier, adder, one_based, fit,
146 plaintext[:50]))
147 if fit < best_fit:
148 best_fit = fit
149 best_multiplier = multiplier
150 best_adder = adder
151 best_one_based = one_based
152 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
153 'and decrypt starting: {4}'.format(
154 best_multiplier, best_adder, best_one_based, best_fit,
155 affine_decipher(sanitised_message, best_multiplier,
156 best_adder, best_one_based)[:50]))
157 return (best_multiplier, best_adder, best_one_based), best_fit
158
159 def keyword_break(message,
160 wordlist=keywords,
161 metric=norms.euclidean_distance,
162 target_counts=normalised_english_counts,
163 message_frequency_scaling=norms.normalise):
164 """Breaks a keyword substitution cipher using a dictionary and
165 frequency analysis
166
167 >>> keyword_break(keyword_encipher('this is a test message for the ' \
168 'keyword decipherment', 'elephant', 1), \
169 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
170 (('elephant', 1), 0.1066453448861...)
171 """
172 best_keyword = ''
173 best_wrap_alphabet = True
174 best_fit = float("inf")
175 for wrap_alphabet in range(3):
176 for keyword in wordlist:
177 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
178 counts = message_frequency_scaling(letter_frequencies(plaintext))
179 fit = metric(target_counts, counts)
180 logger.debug('Keyword break attempt using key {0} (wrap={1}) '
181 'gives fit of {2} and decrypt starting: {3}'.format(
182 keyword, wrap_alphabet, fit,
183 sanitise(plaintext)[:50]))
184 if fit < best_fit:
185 best_fit = fit
186 best_keyword = keyword
187 best_wrap_alphabet = wrap_alphabet
188 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
189 '{2} and decrypt starting: {3}'.format(best_keyword,
190 best_wrap_alphabet, best_fit, sanitise(
191 keyword_decipher(message, best_keyword,
192 best_wrap_alphabet))[:50]))
193 return (best_keyword, best_wrap_alphabet), best_fit
194
195 def keyword_break_mp(message,
196 wordlist=keywords,
197 metric=norms.euclidean_distance,
198 target_counts=normalised_english_counts,
199 message_frequency_scaling=norms.normalise,
200 chunksize=500):
201 """Breaks a keyword substitution cipher using a dictionary and
202 frequency analysis
203
204 >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
205 'keyword decipherment', 'elephant', 1), \
206 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
207 (('elephant', 1), 0.106645344886...)
208 """
209 with Pool() as pool:
210 helper_args = [(message, word, wrap, metric, target_counts,
211 message_frequency_scaling)
212 for word in wordlist for wrap in range(3)]
213 # Gotcha: the helper function here needs to be defined at the top level
214 # (limitation of Pool.starmap)
215 breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
216 return min(breaks, key=lambda k: k[1])
217
218 def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts,
219 message_frequency_scaling):
220 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
221 counts = message_frequency_scaling(letter_frequencies(plaintext))
222 fit = metric(target_counts, counts)
223 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
224 '{2} and decrypt starting: {3}'.format(keyword,
225 wrap_alphabet, fit, sanitise(plaintext)[:50]))
226 return (keyword, wrap_alphabet), fit
227
228 def scytale_break(message,
229 metric=norms.euclidean_distance,
230 target_counts=normalised_english_bigram_counts,
231 message_frequency_scaling=norms.normalise):
232 """Breaks a Scytale cipher
233
234 >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
235 'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
236 'aek') # doctest: +ELLIPSIS
237 (6, 0.092599933059...)
238 """
239 best_key = 0
240 best_fit = float("inf")
241 ngram_length = len(next(iter(target_counts.keys())))
242 for key in range(1, 20):
243 if len(message) % key == 0:
244 plaintext = scytale_decipher(message, key)
245 counts = message_frequency_scaling(frequencies(
246 ngrams(sanitise(plaintext), ngram_length)))
247 fit = metric(target_counts, counts)
248 logger.debug('Scytale break attempt using key {0} gives fit of '
249 '{1} and decrypt starting: {2}'.format(key,
250 fit, sanitise(plaintext)[:50]))
251 if fit < best_fit:
252 best_fit = fit
253 best_key = key
254 logger.info('Scytale break best fit with key {0} gives fit of {1} and '
255 'decrypt starting: {2}'.format(best_key, best_fit,
256 sanitise(scytale_decipher(message, best_key))[:50]))
257 return best_key, best_fit
258
259 def column_transposition_break(message,
260 translist=transpositions,
261 metric=norms.euclidean_distance,
262 target_counts=normalised_english_bigram_counts,
263 message_frequency_scaling=norms.normalise):
264 """Breaks a column transposition cipher using a dictionary and
265 n-gram frequency analysis
266
267 >>> column_transposition_break(column_transposition_encipher(sanitise( \
268 "It is a truth universally acknowledged, that a single man in \
269 possession of a good fortune, must be in want of a wife. However \
270 little known the feelings or views of such a man may be on his \
271 first entering a neighbourhood, this truth is so well fixed in the \
272 minds of the surrounding families, that he is considered the \
273 rightful property of some one or other of their daughters."), \
274 'encipher'), \
275 translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
276 (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
277 (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
278 ((2, 0, 5, 3, 1, 4, 6), 0.0628106372...)
279 >>> column_transposition_break(column_transposition_encipher(sanitise( \
280 "It is a truth universally acknowledged, that a single man in \
281 possession of a good fortune, must be in want of a wife. However \
282 little known the feelings or views of such a man may be on his \
283 first entering a neighbourhood, this truth is so well fixed in the \
284 minds of the surrounding families, that he is considered the \
285 rightful property of some one or other of their daughters."), \
286 'encipher'), \
287 translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
288 (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
289 (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
290 target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
291 ((2, 0, 5, 3, 1, 4, 6), 0.0592259560...)
292 """
293 best_transposition = ''
294 best_fit = float("inf")
295 ngram_length = len(next(iter(target_counts.keys())))
296 for transposition in translist.keys():
297 if len(message) % len(transposition) == 0:
298 plaintext = column_transposition_decipher(message, transposition)
299 counts = message_frequency_scaling(frequencies(
300 ngrams(sanitise(plaintext), ngram_length)))
301 fit = metric(target_counts, counts)
302 logger.debug('Column transposition break attempt using key {0} '
303 'gives fit of {1} and decrypt starting: {2}'.format(
304 translist[transposition][0], fit,
305 sanitise(plaintext)[:50]))
306 if fit < best_fit:
307 best_fit = fit
308 best_transposition = transposition
309 logger.info('Column transposition break best fit with key {0} gives fit '
310 'of {1} and decrypt starting: {2}'.format(
311 translist[best_transposition][0],
312 best_fit, sanitise(
313 column_transposition_decipher(message,
314 best_transposition))[:50]))
315 return best_transposition, best_fit
316
317
318 def column_transposition_break_mp(message,
319 translist=transpositions,
320 metric=norms.euclidean_distance,
321 target_counts=normalised_english_bigram_counts,
322 message_frequency_scaling=norms.normalise,
323 chunksize=500):
324 """Breaks a column transposition cipher using a dictionary and
325 n-gram frequency analysis
326
327 >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
328 "It is a truth universally acknowledged, that a single man in \
329 possession of a good fortune, must be in want of a wife. However \
330 little known the feelings or views of such a man may be on his \
331 first entering a neighbourhood, this truth is so well fixed in the \
332 minds of the surrounding families, that he is considered the \
333 rightful property of some one or other of their daughters."), \
334 'encipher'), \
335 translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
336 (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
337 (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
338 ((2, 0, 5, 3, 1, 4, 6), 0.0628106372...)
339 >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
340 "It is a truth universally acknowledged, that a single man in \
341 possession of a good fortune, must be in want of a wife. However \
342 little known the feelings or views of such a man may be on his \
343 first entering a neighbourhood, this truth is so well fixed in the \
344 minds of the surrounding families, that he is considered the \
345 rightful property of some one or other of their daughters."), \
346 'encipher'), \
347 translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
348 (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
349 (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
350 target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
351 ((2, 0, 5, 3, 1, 4, 6), 0.0592259560...)
352 """
353 ngram_length = len(next(iter(target_counts.keys())))
354 with Pool() as pool:
355 helper_args = [(message, trans, metric, target_counts, ngram_length,
356 message_frequency_scaling)
357 for trans in translist.keys()]
358 # Gotcha: the helper function here needs to be defined at the top level
359 # (limitation of Pool.starmap)
360 breaks = pool.starmap(column_transposition_break_worker, helper_args, chunksize)
361 return min(breaks, key=lambda k: k[1])
362
363 def column_transposition_break_worker(message, transposition, metric, target_counts,
364 ngram_length, message_frequency_scaling):
365 plaintext = column_transposition_decipher(message, transposition)
366 counts = message_frequency_scaling(frequencies(
367 ngrams(sanitise(plaintext), ngram_length)))
368 fit = metric(target_counts, counts)
369 logger.debug('Column transposition break attempt using key {0} '
370 'gives fit of {1} and decrypt starting: {2}'.format(
371 transposition, fit,
372 sanitise(plaintext)[:50]))
373 return transposition, fit
374
375 def vigenere_keyword_break(message,
376 wordlist=keywords,
377 metric=norms.euclidean_distance,
378 target_counts=normalised_english_counts,
379 message_frequency_scaling=norms.normalise):
380 """Breaks a vigenere cipher using a dictionary and
381 frequency analysis
382
383 >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
384 'message for the vigenere decipherment'), 'cat'), \
385 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
386 ('cat', 0.15965224935...)
387 """
388 best_keyword = ''
389 best_fit = float("inf")
390 for keyword in wordlist:
391 plaintext = vigenere_decipher(message, keyword)
392 counts = message_frequency_scaling(letter_frequencies(plaintext))
393 fit = metric(target_counts, counts)
394 logger.debug('Vigenere break attempt using key {0} '
395 'gives fit of {1} and decrypt starting: {2}'.format(
396 keyword, fit,
397 sanitise(plaintext)[:50]))
398 if fit < best_fit:
399 best_fit = fit
400 best_keyword = keyword
401 logger.info('Vigenere break best fit with key {0} gives fit '
402 'of {1} and decrypt starting: {2}'.format(best_keyword,
403 best_fit, sanitise(
404 vigenere_decipher(message, best_keyword))[:50]))
405 return best_keyword, best_fit
406
407 def vigenere_keyword_break_mp(message,
408 wordlist=keywords,
409 metric=norms.euclidean_distance,
410 target_counts=normalised_english_counts,
411 message_frequency_scaling=norms.normalise,
412 chunksize=500):
413 """Breaks a vigenere cipher using a dictionary and
414 frequency analysis
415
416 >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
417 'message for the vigenere decipherment'), 'cat'), \
418 wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
419 ('cat', 0.159652249358...)
420 """
421 with Pool() as pool:
422 helper_args = [(message, word, metric, target_counts,
423 message_frequency_scaling)
424 for word in wordlist]
425 # Gotcha: the helper function here needs to be defined at the top level
426 # (limitation of Pool.starmap)
427 breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize)
428 return min(breaks, key=lambda k: k[1])
429
430 def vigenere_keyword_break_worker(message, keyword, metric, target_counts,
431 message_frequency_scaling):
432 plaintext = vigenere_decipher(message, keyword)
433 counts = message_frequency_scaling(letter_frequencies(plaintext))
434 fit = metric(target_counts, counts)
435 logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
436 '{1} and decrypt starting: {2}'.format(keyword,
437 fit, sanitise(plaintext)[:50]))
438 return keyword, fit
439
440
441
442 def vigenere_frequency_break(message,
443 metric=norms.euclidean_distance,
444 target_counts=normalised_english_counts,
445 message_frequency_scaling=norms.normalise):
446 """Breaks a Vigenere cipher with frequency analysis
447
448 >>> vigenere_frequency_break(vigenere_encipher(sanitise("It is time to " \
449 "run. She is ready and so am I. I stole Daniel's pocketbook this " \
450 "afternoon when he left his jacket hanging on the easel in the " \
451 "attic."), 'florence')) # doctest: +ELLIPSIS
452 ('florence', 0.077657073...)
453 """
454 best_fit = float("inf")
455 best_key = ''
456 sanitised_message = sanitise(message)
457 for trial_length in range(1, 20):
458 splits = every_nth(sanitised_message, trial_length)
459 key = ''.join([chr(caesar_break(s, target_counts=target_counts)[0] + ord('a')) for s in splits])
460 plaintext = vigenere_decipher(sanitised_message, key)
461 counts = message_frequency_scaling(frequencies(plaintext))
462 fit = metric(target_counts, counts)
463 logger.debug('Vigenere key length of {0} ({1}) gives fit of {2}'.
464 format(trial_length, key, fit))
465 if fit < best_fit:
466 best_fit = fit
467 best_key = key
468 logger.info('Vigenere break best fit with key {0} gives fit '
469 'of {1} and decrypt starting: {2}'.format(best_key,
470 best_fit, sanitise(
471 vigenere_decipher(message, best_key))[:50]))
472 return best_key, best_fit
473
474
475
476 if __name__ == "__main__":
477 import doctest
478 doctest.testmod()
479