6 from itertools
import zip_longest
7 from segment
import segment
8 from multiprocessing
import Pool
13 # c5a = open('2012/5a.ciphertext', 'r').read()
14 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
17 logger
= logging
.getLogger(__name__
)
18 logger
.addHandler(logging
.FileHandler('cipher.log'))
19 logger
.setLevel(logging
.WARNING
)
20 #logger.setLevel(logging.INFO)
21 #logger.setLevel(logging.DEBUG)
23 english_counts
= collections
.defaultdict(int)
24 with
open('count_1l.txt', 'r') as f
:
26 (letter
, count
) = line
.split("\t")
27 english_counts
[letter
] = int(count
)
28 normalised_english_counts
= norms
.normalise(english_counts
)
30 english_bigram_counts
= collections
.defaultdict(int)
31 with
open('count_2l.txt', 'r') as f
:
33 (bigram
, count
) = line
.split("\t")
34 english_bigram_counts
[bigram
] = int(count
)
35 normalised_english_bigram_counts
= norms
.normalise(english_bigram_counts
)
37 with
open('words.txt', 'r') as f
:
38 keywords
= [line
.rstrip() for line
in f
]
40 modular_division_table
= [[0]*26 for x
in range(26)]
44 modular_division_table
[b
][c
] = a
48 """Remove all non-alphabetic characters and convert the text to lowercase
50 >>> sanitise('The Quick')
52 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
53 'thequickbrownfoxjumpedoverthelazydog'
55 sanitised
= [c
.lower() for c
in text
if c
in string
.ascii_letters
]
56 return ''.join(sanitised
)
59 """Returns all n-grams of a text
61 >>> ngrams(sanitise('the quick brown fox'), 2)
62 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn', 'nf', 'fo', 'ox']
63 >>> ngrams(sanitise('the quick brown fox'), 4)
64 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow', 'rown', 'ownf', 'wnfo', 'nfox']
66 return [text
[i
:i
+n
] for i
in range(len(text
)-n
+1)]
68 def every_nth(text
, n
):
69 """Returns n strings, each of which consists of every nth character,
70 starting with the 0th, 1st, 2nd, ... (n-1)th character
72 >>> every_nth(string.ascii_lowercase, 5)
73 ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
74 >>> every_nth(string.ascii_lowercase, 1)
75 ['abcdefghijklmnopqrstuvwxyz']
76 >>> every_nth(string.ascii_lowercase, 26)
77 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
79 split_text
= [text
[i
:i
+n
] for i
in range(0, len(text
), n
)]
80 return [''.join(l
) for l
in zip_longest(*split_text
, fillvalue
='')]
82 def combine_every_nth(split_text
):
83 """Reforms a text split into every_nth strings
85 >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
86 'abcdefghijklmnopqrstuvwxyz'
87 >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
88 'abcdefghijklmnopqrstuvwxyz'
89 >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
90 'abcdefghijklmnopqrstuvwxyz'
92 return ''.join([''.join(l
) for l
in zip_longest(*split_text
, fillvalue
='')])
95 def frequencies(text
):
96 """Count the number of occurrences of each character in text
98 >>> sorted(frequencies('abcdefabc').items())
99 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
100 >>> sorted(frequencies('the quick brown fox jumped over the lazy dog').items())
101 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
102 >>> sorted(frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
103 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
104 >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
105 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
107 counts
= collections
.defaultdict(int)
111 letter_frequencies
= frequencies
113 def deduplicate(text
):
114 return list(collections
.OrderedDict
.fromkeys(text
))
118 def caesar_encipher_letter(letter
, shift
):
119 """Encipher a letter, given a shift amount
121 >>> caesar_encipher_letter('a', 1)
123 >>> caesar_encipher_letter('a', 2)
125 >>> caesar_encipher_letter('b', 2)
127 >>> caesar_encipher_letter('x', 2)
129 >>> caesar_encipher_letter('y', 2)
131 >>> caesar_encipher_letter('z', 2)
133 >>> caesar_encipher_letter('z', -1)
135 >>> caesar_encipher_letter('a', -1)
138 if letter
in string
.ascii_letters
:
139 if letter
in string
.ascii_uppercase
:
140 alphabet_start
= ord('A')
142 alphabet_start
= ord('a')
143 return chr(((ord(letter
) - alphabet_start
+ shift
) % 26) + alphabet_start
)
147 def caesar_decipher_letter(letter
, shift
):
148 """Decipher a letter, given a shift amount
150 >>> caesar_decipher_letter('b', 1)
152 >>> caesar_decipher_letter('b', 2)
155 return caesar_encipher_letter(letter
, -shift
)
157 def caesar_encipher(message
, shift
):
158 """Encipher a message with the Caesar cipher of given shift
160 >>> caesar_encipher('abc', 1)
162 >>> caesar_encipher('abc', 2)
164 >>> caesar_encipher('abcxyz', 2)
166 >>> caesar_encipher('ab cx yz', 2)
169 enciphered
= [caesar_encipher_letter(l
, shift
) for l
in message
]
170 return ''.join(enciphered
)
172 def caesar_decipher(message
, shift
):
173 """Encipher a message with the Caesar cipher of given shift
175 >>> caesar_decipher('bcd', 1)
177 >>> caesar_decipher('cde', 2)
179 >>> caesar_decipher('cd ez ab', 2)
182 return caesar_encipher(message
, -shift
)
184 def affine_encipher_letter(letter
, multiplier
=1, adder
=0, one_based
=True):
185 """Encipher a letter, given a multiplier and adder
187 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
188 'HKNQTWZCFILORUXADGJMPSVYBE'
189 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
190 'FILORUXADGJMPSVYBEHKNQTWZC'
192 if letter
in string
.ascii_letters
:
193 if letter
in string
.ascii_uppercase
:
194 alphabet_start
= ord('A')
196 alphabet_start
= ord('a')
197 letter_number
= ord(letter
) - alphabet_start
198 if one_based
: letter_number
+= 1
199 cipher_number
= (letter_number
* multiplier
+ adder
) % 26
200 if one_based
: cipher_number
-= 1
201 return chr(cipher_number
% 26 + alphabet_start
)
205 def affine_decipher_letter(letter
, multiplier
=1, adder
=0, one_based
=True):
206 """Encipher a letter, given a multiplier and adder
208 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
209 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
210 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
211 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
213 if letter
in string
.ascii_letters
:
214 if letter
in string
.ascii_uppercase
:
215 alphabet_start
= ord('A')
217 alphabet_start
= ord('a')
218 cipher_number
= ord(letter
) - alphabet_start
219 if one_based
: cipher_number
+= 1
220 plaintext_number
= modular_division_table
[multiplier
][(cipher_number
- adder
) % 26]
221 if one_based
: plaintext_number
-= 1
222 return chr(plaintext_number
% 26 + alphabet_start
)
226 def affine_encipher(message
, multiplier
=1, adder
=0, one_based
=True):
227 """Encipher a message
229 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
230 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
232 enciphered
= [affine_encipher_letter(l
, multiplier
, adder
, one_based
) for l
in message
]
233 return ''.join(enciphered
)
235 def affine_decipher(message
, multiplier
=1, adder
=0, one_based
=True):
236 """Decipher a message
238 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
239 'hours passed during which jerico tried every trick he could think of'
241 enciphered
= [affine_decipher_letter(l
, multiplier
, adder
, one_based
) for l
in message
]
242 return ''.join(enciphered
)
245 def keyword_cipher_alphabet_of(keyword
, wrap_alphabet
=0):
246 """Find the cipher alphabet given a keyword.
247 wrap_alphabet controls how the rest of the alphabet is added
250 1 : from the last letter in the sanitised keyword
251 2 : from the largest letter in the sanitised keyword
253 >>> keyword_cipher_alphabet_of('bayes')
254 'bayescdfghijklmnopqrtuvwxz'
255 >>> keyword_cipher_alphabet_of('bayes', 0)
256 'bayescdfghijklmnopqrtuvwxz'
257 >>> keyword_cipher_alphabet_of('bayes', 1)
258 'bayestuvwxzcdfghijklmnopqr'
259 >>> keyword_cipher_alphabet_of('bayes', 2)
260 'bayeszcdfghijklmnopqrtuvwx'
262 if wrap_alphabet
== 0:
263 cipher_alphabet
= ''.join(deduplicate(sanitise(keyword
) + string
.ascii_lowercase
))
265 if wrap_alphabet
== 1:
266 last_keyword_letter
= deduplicate(sanitise(keyword
))[-1]
268 last_keyword_letter
= sorted(sanitise(keyword
))[-1]
269 last_keyword_position
= string
.ascii_lowercase
.find(last_keyword_letter
) + 1
270 cipher_alphabet
= ''.join(deduplicate(sanitise(keyword
) +
271 string
.ascii_lowercase
[last_keyword_position
:] +
272 string
.ascii_lowercase
))
273 return cipher_alphabet
276 def keyword_encipher(message
, keyword
, wrap_alphabet
=0):
277 """Enciphers a message with a keyword substitution cipher.
278 wrap_alphabet controls how the rest of the alphabet is added
281 1 : from the last letter in the sanitised keyword
282 2 : from the largest letter in the sanitised keyword
284 >>> keyword_encipher('test message', 'bayes')
286 >>> keyword_encipher('test message', 'bayes', 0)
288 >>> keyword_encipher('test message', 'bayes', 1)
290 >>> keyword_encipher('test message', 'bayes', 2)
293 cipher_alphabet
= keyword_cipher_alphabet_of(keyword
, wrap_alphabet
)
294 cipher_translation
= ''.maketrans(string
.ascii_lowercase
, cipher_alphabet
)
295 return message
.lower().translate(cipher_translation
)
297 def keyword_decipher(message
, keyword
, wrap_alphabet
=0):
298 """Deciphers a message with a keyword substitution cipher.
299 wrap_alphabet controls how the rest of the alphabet is added
302 1 : from the last letter in the sanitised keyword
303 2 : from the largest letter in the sanitised keyword
305 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
307 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
309 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
311 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
314 cipher_alphabet
= keyword_cipher_alphabet_of(keyword
, wrap_alphabet
)
315 cipher_translation
= ''.maketrans(cipher_alphabet
, string
.ascii_lowercase
)
316 return message
.lower().translate(cipher_translation
)
318 def scytale_encipher(message
, rows
):
319 """Enciphers using the scytale transposition cipher.
320 Message is padded with spaces to allow all rows to be the same length.
322 >>> scytale_encipher('thequickbrownfox', 3)
324 >>> scytale_encipher('thequickbrownfox', 4)
326 >>> scytale_encipher('thequickbrownfox', 5)
327 'tubn hirf ecoo qkwx '
328 >>> scytale_encipher('thequickbrownfox', 6)
330 >>> scytale_encipher('thequickbrownfox', 7)
331 'tqcrnx hukof eibwo '
333 if len(message
) % rows
!= 0:
334 message
+= ' '*(rows
- len(message
) % rows
)
335 row_length
= round(len(message
) / rows
)
336 slices
= [message
[i
:i
+row_length
] for i
in range(0, len(message
), row_length
)]
337 return ''.join([''.join(r
) for r
in zip_longest(*slices
, fillvalue
='')])
339 def scytale_decipher(message
, rows
):
340 """Deciphers using the scytale transposition cipher.
341 Assumes the message is padded so that all rows are the same length.
343 >>> scytale_decipher('tcnhkfeboqrxuo iw ', 3)
345 >>> scytale_decipher('tubnhirfecooqkwx', 4)
347 >>> scytale_decipher('tubn hirf ecoo qkwx ', 5)
349 >>> scytale_decipher('tqcrnxhukof eibwo ', 6)
351 >>> scytale_decipher('tqcrnx hukof eibwo ', 7)
354 cols
= round(len(message
) / rows
)
355 columns
= [message
[i
:i
+rows
] for i
in range(0, cols
* rows
, rows
)]
356 return ''.join([''.join(c
) for c
in zip_longest(*columns
, fillvalue
='')])
359 def caesar_break(message
, metric
=norms
.euclidean_distance
, target_counts
=normalised_english_counts
, message_frequency_scaling
=norms
.normalise
):
360 """Breaks a Caesar cipher using frequency analysis
362 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
363 (4, 0.31863952890183...)
364 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
365 (19, 0.42152901235832...)
366 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
367 (13, 0.316029208075451...)
369 sanitised_message
= sanitise(message
)
371 best_fit
= float("inf")
372 for shift
in range(26):
373 plaintext
= caesar_decipher(sanitised_message
, shift
)
374 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
375 fit
= metric(target_counts
, counts
)
376 logger
.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift
, fit
, plaintext
[:50]))
380 logger
.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift
, best_fit
, caesar_decipher(sanitised_message
, best_shift
)[:50]))
381 return best_shift
, best_fit
383 def affine_break(message
, metric
=norms
.euclidean_distance
, target_counts
=normalised_english_counts
, message_frequency_scaling
=norms
.normalise
):
384 """Breaks an affine cipher using frequency analysis
386 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
387 ((15, 22, True), 0.23570361818655...)
389 sanitised_message
= sanitise(message
)
392 best_one_based
= True
393 best_fit
= float("inf")
394 for one_based
in [True, False]:
395 for multiplier
in range(1, 26, 2):
396 for adder
in range(26):
397 plaintext
= affine_decipher(sanitised_message
, multiplier
, adder
, one_based
)
398 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
399 fit
= metric(target_counts
, counts
)
400 logger
.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier
, adder
, one_based
, fit
, plaintext
[:50]))
403 best_multiplier
= multiplier
405 best_one_based
= one_based
406 logger
.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier
, best_adder
, best_one_based
, best_fit
, affine_decipher(sanitised_message
, best_multiplier
, best_adder
, best_one_based
)[:50]))
407 return (best_multiplier
, best_adder
, best_one_based
), best_fit
410 def keyword_break(message
, wordlist
=keywords
, metric
=norms
.euclidean_distance
, target_counts
=normalised_english_counts
, message_frequency_scaling
=norms
.normalise
):
411 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
413 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
414 (('elephant', 1), 0.41643991598441...)
417 best_wrap_alphabet
= True
418 best_fit
= float("inf")
419 for wrap_alphabet
in range(3):
420 for keyword
in wordlist
:
421 plaintext
= keyword_decipher(message
, keyword
, wrap_alphabet
)
422 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
423 fit
= metric(target_counts
, counts
)
424 logger
.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword
, wrap_alphabet
, fit
, sanitise(plaintext
)[:50]))
427 best_keyword
= keyword
428 best_wrap_alphabet
= wrap_alphabet
429 logger
.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword
, best_wrap_alphabet
, best_fit
, sanitise(keyword_decipher(message
, best_keyword
))[:50]))
430 return (best_keyword
, best_wrap_alphabet
), best_fit
432 def keyword_break_mp(message
, wordlist
=keywords
, metric
=norms
.euclidean_distance
, target_counts
=normalised_english_counts
, message_frequency_scaling
=norms
.normalise
, chunksize
=500):
433 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
435 >>> keyword_break_mp(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
436 (('elephant', 1), 0.41643991598441...)
439 helper_args
= [(message
, word
, wrap
, metric
, target_counts
, message_frequency_scaling
) for word
in wordlist
for wrap
in range(3)]
440 # breaks = map(lambda kw: keyword_break_one(message, kw[0], kw[1], metric, target_counts, message_frequency_scaling), keys)
441 breaks
= pool
.starmap(keyword_break_one
, helper_args
, chunksize
)
442 return min(breaks
, key
=lambda k
: k
[1])
444 def keyword_break_one(message
, keyword
, wrap_alphabet
, metric
, target_counts
, message_frequency_scaling
):
445 plaintext
= keyword_decipher(message
, keyword
, wrap_alphabet
)
446 counts
= message_frequency_scaling(letter_frequencies(plaintext
))
447 fit
= metric(target_counts
, counts
)
448 logger
.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword
, wrap_alphabet
, fit
, sanitise(plaintext
)[:50]))
449 return (keyword
, wrap_alphabet
), fit
452 def scytale_break(message
, metric
=norms
.euclidean_distance
, target_counts
=normalised_english_bigram_counts
, message_frequency_scaling
=norms
.normalise
):
453 """Breaks a Scytale cipher
455 >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoeredcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriiotoaek') # doctest: +ELLIPSIS
456 (6, 0.83453041115025...)
459 best_fit
= float("inf")
460 for key
in range(1, 20):
461 if len(message
) % key
== 0:
462 plaintext
= scytale_decipher(message
, key
)
463 counts
= message_frequency_scaling(frequencies(ngrams(sanitise(plaintext
), 2)))
464 fit
= metric(target_counts
, counts
)
465 logger
.debug('Scytale break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(key
, fit
, sanitise(plaintext
)[:50]))
469 logger
.info('Scytale break best fit with key {0} gives fit of {1} and decrypt starting: {2}'.format(best_key
, best_fit
, sanitise(scytale_decipher(message
, best_key
))[:50]))
470 return best_key
, best_fit
473 if __name__
== "__main__":