6 from itertools
import zip_longest
7 from segment
import segment
8 from multiprocessing
import Pool
13 # c5a = open('2012/5a.ciphertext', 'r').read()
14 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
15 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1
17 logger
= logging
.getLogger(__name__
)
18 logger
.addHandler(logging
.FileHandler('cipher.log'))
19 logger
.setLevel(logging
.WARNING
)
20 #logger.setLevel(logging.INFO)
21 #logger.setLevel(logging.DEBUG)
23 english_counts
= collections
.defaultdict(int)
24 with
open('count_1l.txt', 'r') as f
:
26 (letter
, count
) = line
.split("\t")
27 english_counts
[letter
] = int(count
)
28 normalised_english_counts
= norms
.normalise(english_counts
)
30 english_bigram_counts
= collections
.defaultdict(int)
31 with
open('count_2l.txt', 'r') as f
:
33 (bigram
, count
) = line
.split("\t")
34 english_bigram_counts
[bigram
] = int(count
)
35 normalised_english_bigram_counts
= norms
.normalise(english_bigram_counts
)
37 with
open('words.txt', 'r') as f
:
38 keywords
= [line
.rstrip() for line
in f
]
40 modular_division_table
= [[0]*26 for x
in range(26)]
44 modular_division_table
[b
][c
] = a
48 """Remove all non-alphabetic characters and convert the text to lowercase
50 >>> sanitise('The Quick')
52 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
53 'thequickbrownfoxjumpedoverthelazydog'
55 sanitised
= [c
.lower() for c
in text
if c
in string
.ascii_letters
]
56 return ''.join(sanitised
)
59 """Returns all n-grams of a text
61 >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
62 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
64 >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
65 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
66 'rown', 'ownf', 'wnfo', 'nfox']
68 return [text
[i
:i
+n
] for i
in range(len(text
)-n
+1)]
70 def every_nth(text
, n
):
71 """Returns n strings, each of which consists of every nth character,
72 starting with the 0th, 1st, 2nd, ... (n-1)th character
74 >>> every_nth(string.ascii_lowercase, 5)
75 ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
76 >>> every_nth(string.ascii_lowercase, 1)
77 ['abcdefghijklmnopqrstuvwxyz']
78 >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
79 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
80 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
82 split_text
= [text
[i
:i
+n
] for i
in range(0, len(text
), n
)]
83 return [''.join(l
) for l
in zip_longest(*split_text
, fillvalue
='')]
85 def combine_every_nth(split_text
):
86 """Reforms a text split into every_nth strings
88 >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
89 'abcdefghijklmnopqrstuvwxyz'
90 >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
91 'abcdefghijklmnopqrstuvwxyz'
92 >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
93 'abcdefghijklmnopqrstuvwxyz'
95 return ''.join([''.join(l
)
96 for l
in zip_longest(*split_text
, fillvalue
='')])
99 def frequencies(text
):
100 """Count the number of occurrences of each character in text
102 >>> sorted(frequencies('abcdefabc').items())
103 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
104 >>> sorted(frequencies('the quick brown fox jumped over the lazy dog').items()) # doctest: +NORMALIZE_WHITESPACE
105 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
106 ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
107 ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
108 ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
109 >>> sorted(frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
110 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
111 ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
112 ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
113 ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
114 ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
115 >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
116 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
117 ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
118 ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
119 ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
121 counts
= collections
.defaultdict(int)
125 letter_frequencies
= frequencies
127 def deduplicate(text
):
128 return list(collections
.OrderedDict
.fromkeys(text
))
132 def caesar_encipher_letter(letter
, shift
):
133 """Encipher a letter, given a shift amount
135 >>> caesar_encipher_letter('a', 1)
137 >>> caesar_encipher_letter('a', 2)
139 >>> caesar_encipher_letter('b', 2)
141 >>> caesar_encipher_letter('x', 2)
143 >>> caesar_encipher_letter('y', 2)
145 >>> caesar_encipher_letter('z', 2)
147 >>> caesar_encipher_letter('z', -1)
149 >>> caesar_encipher_letter('a', -1)
152 if letter
in string
.ascii_letters
:
153 if letter
in string
.ascii_uppercase
:
154 alphabet_start
= ord('A')
156 alphabet_start
= ord('a')
157 return chr(((ord(letter
) - alphabet_start
+ shift
) % 26) +
162 def caesar_decipher_letter(letter
, shift
):
163 """Decipher a letter, given a shift amount
165 >>> caesar_decipher_letter('b', 1)
167 >>> caesar_decipher_letter('b', 2)
170 return caesar_encipher_letter(letter
, -shift
)
172 def caesar_encipher(message
, shift
):
173 """Encipher a message with the Caesar cipher of given shift
175 >>> caesar_encipher('abc', 1)
177 >>> caesar_encipher('abc', 2)
179 >>> caesar_encipher('abcxyz', 2)
181 >>> caesar_encipher('ab cx yz', 2)
184 enciphered
= [caesar_encipher_letter(l
, shift
) for l
in message
]
185 return ''.join(enciphered
)
187 def caesar_decipher(message
, shift
):
188 """Encipher a message with the Caesar cipher of given shift
190 >>> caesar_decipher('bcd', 1)
192 >>> caesar_decipher('cde', 2)
194 >>> caesar_decipher('cd ez ab', 2)
197 return caesar_encipher(message
, -shift
)
199 def affine_encipher_letter(letter
, multiplier
=1, adder
=0, one_based
=True):
200 """Encipher a letter, given a multiplier and adder
202 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
203 'HKNQTWZCFILORUXADGJMPSVYBE'
204 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
205 'FILORUXADGJMPSVYBEHKNQTWZC'
207 if letter
in string
.ascii_letters
:
208 if letter
in string
.ascii_uppercase
:
209 alphabet_start
= ord('A')
211 alphabet_start
= ord('a')
212 letter_number
= ord(letter
) - alphabet_start
213 if one_based
: letter_number
+= 1
214 cipher_number
= (letter_number
* multiplier
+ adder
) % 26
215 if one_based
: cipher_number
-= 1
216 return chr(cipher_number
% 26 + alphabet_start
)
220 def affine_decipher_letter(letter
, multiplier
=1, adder
=0, one_based
=True):
221 """Encipher a letter, given a multiplier and adder
223 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
224 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
225 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
226 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
228 if letter
in string
.ascii_letters
:
229 if letter
in string
.ascii_uppercase
:
230 alphabet_start
= ord('A')
232 alphabet_start
= ord('a')
233 cipher_number
= ord(letter
) - alphabet_start
234 if one_based
: cipher_number
+= 1
235 plaintext_number
= ( modular_division_table
[multiplier
]
236 [(cipher_number
- adder
) % 26] )
237 if one_based
: plaintext_number
-= 1
238 return chr(plaintext_number
% 26 + alphabet_start
)
242 def affine_encipher(message
, multiplier
=1, adder
=0, one_based
=True):
243 """Encipher a message
245 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
246 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
248 enciphered
= [affine_encipher_letter(l
, multiplier
, adder
, one_based
)
250 return ''.join(enciphered
)
252 def affine_decipher(message
, multiplier
=1, adder
=0, one_based
=True):
253 """Decipher a message
255 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
256 'hours passed during which jerico tried every trick he could think of'
258 enciphered
= [affine_decipher_letter(l
, multiplier
, adder
, one_based
)
260 return ''.join(enciphered
)
263 def keyword_cipher_alphabet_of(keyword
, wrap_alphabet
=0):
264 """Find the cipher alphabet given a keyword.
265 wrap_alphabet controls how the rest of the alphabet is added
268 1 : from the last letter in the sanitised keyword
269 2 : from the largest letter in the sanitised keyword
271 >>> keyword_cipher_alphabet_of('bayes')
272 'bayescdfghijklmnopqrtuvwxz'
273 >>> keyword_cipher_alphabet_of('bayes', 0)
274 'bayescdfghijklmnopqrtuvwxz'
275 >>> keyword_cipher_alphabet_of('bayes', 1)
276 'bayestuvwxzcdfghijklmnopqr'
277 >>> keyword_cipher_alphabet_of('bayes', 2)
278 'bayeszcdfghijklmnopqrtuvwx'
280 if wrap_alphabet
== 0:
281 cipher_alphabet
= ''.join(deduplicate(sanitise(keyword
) +
282 string
.ascii_lowercase
))
284 if wrap_alphabet
== 1:
285 last_keyword_letter
= deduplicate(sanitise(keyword
))[-1]
287 last_keyword_letter
= sorted(sanitise(keyword
))[-1]
288 last_keyword_position
= string
.ascii_lowercase
.find(
289 last_keyword_letter
) + 1
290 cipher_alphabet
= ''.join(
291 deduplicate(sanitise(keyword
) +
292 string
.ascii_lowercase
[last_keyword_position
:] +
293 string
.ascii_lowercase
))
294 return cipher_alphabet
297 def keyword_encipher(message
, keyword
, wrap_alphabet
=0):
298 """Enciphers a message with a keyword substitution cipher.
299 wrap_alphabet controls how the rest of the alphabet is added
302 1 : from the last letter in the sanitised keyword
303 2 : from the largest letter in the sanitised keyword
305 >>> keyword_encipher('test message', 'bayes')
307 >>> keyword_encipher('test message', 'bayes', 0)
309 >>> keyword_encipher('test message', 'bayes', 1)
311 >>> keyword_encipher('test message', 'bayes', 2)
314 cipher_alphabet
= keyword_cipher_alphabet_of(keyword
, wrap_alphabet
)
315 cipher_translation
= ''.maketrans(string
.ascii_lowercase
, cipher_alphabet
)
316 return message
.lower().translate(cipher_translation
)
318 def keyword_decipher(message
, keyword
, wrap_alphabet
=0):
319 """Deciphers a message with a keyword substitution cipher.
320 wrap_alphabet controls how the rest of the alphabet is added
323 1 : from the last letter in the sanitised keyword
324 2 : from the largest letter in the sanitised keyword
326 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
328 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
330 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
332 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
335 cipher_alphabet
= keyword_cipher_alphabet_of(keyword
, wrap_alphabet
)
336 cipher_translation
= ''.maketrans(cipher_alphabet
, string
.ascii_lowercase
)
337 return message
.lower().translate(cipher_translation
)
339 def scytale_encipher(message
, rows
):
340 """Enciphers using the scytale transposition cipher.
341 Message is padded with spaces to allow all rows to be the same length.
343 >>> scytale_encipher('thequickbrownfox', 3)
345 >>> scytale_encipher('thequickbrownfox', 4)
347 >>> scytale_encipher('thequickbrownfox', 5)
348 'tubn hirf ecoo qkwx '
349 >>> scytale_encipher('thequickbrownfox', 6)
351 >>> scytale_encipher('thequickbrownfox', 7)
352 'tqcrnx hukof eibwo '
354 if len(message
) % rows
!= 0:
355 message
+= ' '*(rows
- len(message
) % rows
)
356 row_length
= round(len(message
) / rows
)
357 slices
= [message
[i
:i
+row_length
]
358 for i
in range(0, len(message
), row_length
)]
359 return ''.join([''.join(r
) for r
in zip_longest(*slices
, fillvalue
='')])
361 def scytale_decipher(message
, rows
):
362 """Deciphers using the scytale transposition cipher.
363 Assumes the message is padded so that all rows are the same length.
365 >>> scytale_decipher('tcnhkfeboqrxuo iw ', 3)
367 >>> scytale_decipher('tubnhirfecooqkwx', 4)
369 >>> scytale_decipher('tubn hirf ecoo qkwx ', 5)
371 >>> scytale_decipher('tqcrnxhukof eibwo ', 6)
373 >>> scytale_decipher('tqcrnx hukof eibwo ', 7)
376 cols
= round(len(message
) / rows
)
377 columns
= [message
[i
:i
+rows
] for i
in range(0, cols
* rows
, rows
)]
378 return ''.join([''.join(c
) for c
in zip_longest(*columns
, fillvalue
='')])
381 def caesar_break(message
,
382 metric
=norms
.euclidean_distance
,
383 target_counts
=normalised_english_counts
,
384 message_frequency_scaling
=norms
.normalise
):
385 """Breaks a Caesar cipher using frequency analysis
387 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
388 (4, 0.31863952890183...)
389 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
390 (19, 0.42152901235832...)
391 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
392 (13, 0.316029208075451...)
394 sanitised_message
= sanitise(message
)
396 best_fit
= float("inf")
397 for shift
in range(26):
398 plaintext
= caesar_decipher(sanitised_message
, shift
)
399 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
400 fit
= metric(target_counts
, counts
)
401 logger
.debug('Caesar break attempt using key {0} gives fit of {1} '
402 'and decrypt starting: {2}'.format(shift
, fit
, plaintext
[:50]))
406 logger
.info('Caesar break best fit: key {0} gives fit of {1} and '
407 'decrypt starting: {2}'.format(best_shift
, best_fit
,
408 caesar_decipher(sanitised_message
, best_shift
)[:50]))
409 return best_shift
, best_fit
411 def affine_break(message
,
412 metric
=norms
.euclidean_distance
,
413 target_counts
=normalised_english_counts
,
414 message_frequency_scaling
=norms
.normalise
):
415 """Breaks an affine cipher using frequency analysis
417 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
418 ((15, 22, True), 0.23570361818655...)
420 sanitised_message
= sanitise(message
)
423 best_one_based
= True
424 best_fit
= float("inf")
425 for one_based
in [True, False]:
426 for multiplier
in range(1, 26, 2):
427 for adder
in range(26):
428 plaintext
= affine_decipher(sanitised_message
,
429 multiplier
, adder
, one_based
)
430 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
431 fit
= metric(target_counts
, counts
)
432 logger
.debug('Affine break attempt using key {0}x+{1} ({2}) '
433 'gives fit of {3} and decrypt starting: {4}'.
434 format(multiplier
, adder
, one_based
, fit
,
438 best_multiplier
= multiplier
440 best_one_based
= one_based
441 logger
.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
442 'and decrypt starting: {4}'.format(
443 best_multiplier
, best_adder
, best_one_based
, best_fit
,
444 affine_decipher(sanitised_message
, best_multiplier
,
445 best_adder
, best_one_based
)[:50]))
446 return (best_multiplier
, best_adder
, best_one_based
), best_fit
448 def keyword_break(message
,
450 metric
=norms
.euclidean_distance
,
451 target_counts
=normalised_english_counts
,
452 message_frequency_scaling
=norms
.normalise
):
453 """Breaks a keyword substitution cipher using a dictionary and
456 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
457 (('elephant', 1), 0.41643991598441...)
460 best_wrap_alphabet
= True
461 best_fit
= float("inf")
462 for wrap_alphabet
in range(3):
463 for keyword
in wordlist
:
464 plaintext
= keyword_decipher(message
, keyword
, wrap_alphabet
)
465 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
466 fit
= metric(target_counts
, counts
)
467 logger
.debug('Keyword break attempt using key {0} (wrap={1}) '
468 'gives fit of {2} and decrypt starting: {3}'.format(
469 keyword
, wrap_alphabet
, fit
,
470 sanitise(plaintext
)[:50]))
473 best_keyword
= keyword
474 best_wrap_alphabet
= wrap_alphabet
475 logger
.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
476 '{2} and decrypt starting: {3}'.format(best_keyword
,
477 best_wrap_alphabet
, best_fit
, sanitise(
478 keyword_decipher(message
, best_keyword
,
479 best_wrap_alphabet
))[:50]))
480 return (best_keyword
, best_wrap_alphabet
), best_fit
482 def keyword_break_mp(message
,
484 metric
=norms
.euclidean_distance
,
485 target_counts
=normalised_english_counts
,
486 message_frequency_scaling
=norms
.normalise
,
488 """Breaks a keyword substitution cipher using a dictionary and
491 >>> keyword_break_mp(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
492 (('elephant', 1), 0.41643991598441...)
495 helper_args
= [(message
, word
, wrap
, metric
, target_counts
,
496 message_frequency_scaling
)
497 for word
in wordlist
for wrap
in range(3)]
498 breaks
= pool
.starmap(keyword_break_one
, helper_args
, chunksize
) # Gotcha: the helper function here needs to be defined at the top level (limitation of Pool.starmap)
499 return min(breaks
, key
=lambda k
: k
[1])
501 def keyword_break_one(message
, keyword
, wrap_alphabet
, metric
, target_counts
,
502 message_frequency_scaling
):
503 plaintext
= keyword_decipher(message
, keyword
, wrap_alphabet
)
504 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
505 fit
= metric(target_counts
, counts
)
506 logger
.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
507 '{2} and decrypt starting: {3}'.format(keyword
,
508 wrap_alphabet
, fit
, sanitise(plaintext
)[:50]))
509 return (keyword
, wrap_alphabet
), fit
511 def scytale_break(message
,
512 metric
=norms
.euclidean_distance
,
513 target_counts
=normalised_english_bigram_counts
,
514 message_frequency_scaling
=norms
.normalise
):
515 """Breaks a Scytale cipher
517 >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoeredcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriiotoaek') # doctest: +ELLIPSIS
518 (6, 0.83453041115025...)
521 best_fit
= float("inf")
522 for key
in range(1, 20):
523 if len(message
) % key
== 0:
524 plaintext
= scytale_decipher(message
, key
)
525 counts
= message_frequency_scaling(frequencies(
526 ngrams(sanitise(plaintext
), 2)))
527 fit
= metric(target_counts
, counts
)
528 logger
.debug('Scytale break attempt using key {0} gives fit of '
529 '{1} and decrypt starting: {2}'.format(key
,
530 fit
, sanitise(plaintext
)[:50]))
534 logger
.info('Scytale break best fit with key {0} gives fit of {1} and '
535 'decrypt starting: {2}'.format(best_key
, best_fit
,
536 sanitise(scytale_decipher(message
, best_key
))[:50]))
537 return best_key
, best_fit
540 if __name__
== "__main__":