Done scytale breaking
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5 import math
6 from itertools import zip_longest
7 from segment import segment
8
9 # To time a run:
10 #
11 # import timeit
12 # c5a = open('2012/5a.ciphertext', 'r').read()
13 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
14
15
16 logger = logging.getLogger(__name__)
17 logger.addHandler(logging.FileHandler('cipher.log'))
18 logger.setLevel(logging.WARNING)
19 #logger.setLevel(logging.INFO)
20 #logger.setLevel(logging.DEBUG)
21
22 english_counts = collections.defaultdict(int)
23 with open('count_1l.txt', 'r') as f:
24 for line in f:
25 (letter, count) = line.split("\t")
26 english_counts[letter] = int(count)
27 normalised_english_counts = norms.normalise(english_counts)
28
29 english_bigram_counts = collections.defaultdict(int)
30 with open('count_2l.txt', 'r') as f:
31 for line in f:
32 (bigram, count) = line.split("\t")
33 english_bigram_counts[bigram] = int(count)
34 normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
35
36 with open('words.txt', 'r') as f:
37 keywords = [line.rstrip() for line in f]
38
39 modular_division_table = [[0]*26 for x in range(26)]
40 for a in range(26):
41 for b in range(26):
42 c = (a * b) % 26
43 modular_division_table[b][c] = a
44
45
46 def sanitise(text):
47 """Remove all non-alphabetic characters and convert the text to lowercase
48
49 >>> sanitise('The Quick')
50 'thequick'
51 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
52 'thequickbrownfoxjumpedoverthelazydog'
53 """
54 sanitised = [c.lower() for c in text if c in string.ascii_letters]
55 return ''.join(sanitised)
56
57 def ngrams(text, n):
58 """Returns all n-grams of a text
59
60 >>> ngrams(sanitise('the quick brown fox'), 2)
61 ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn', 'nf', 'fo', 'ox']
62 >>> ngrams(sanitise('the quick brown fox'), 4)
63 ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow', 'rown', 'ownf', 'wnfo', 'nfox']
64 """
65 return [text[i:i+n] for i in range(len(text)-n+1)]
66
67 def every_nth(text, n):
68 """Returns n strings, each of which consists of every nth character,
69 starting with the 0th, 1st, 2nd, ... (n-1)th character
70
71 >>> every_nth(string.ascii_lowercase, 5)
72 ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
73 >>> every_nth(string.ascii_lowercase, 1)
74 ['abcdefghijklmnopqrstuvwxyz']
75 >>> every_nth(string.ascii_lowercase, 26)
76 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
77 """
78 split_text = [text[i:i+n] for i in range(0, len(text), n)]
79 return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
80
81 def combine_every_nth(split_text):
82 """Reforms a text split into every_nth strings
83
84 >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
85 'abcdefghijklmnopqrstuvwxyz'
86 >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
87 'abcdefghijklmnopqrstuvwxyz'
88 >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
89 'abcdefghijklmnopqrstuvwxyz'
90 """
91 return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
92
93
94 def frequencies(text):
95 """Count the number of occurrences of each character in text
96
97 >>> sorted(frequencies('abcdefabc').items())
98 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
99 >>> sorted(frequencies('the quick brown fox jumped over the lazy dog').items())
100 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
101 >>> sorted(frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
102 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
103 >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
104 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
105 """
106 counts = collections.defaultdict(int)
107 for c in text:
108 counts[c] += 1
109 return counts
110 letter_frequencies = frequencies
111
112 def deduplicate(text):
113 return list(collections.OrderedDict.fromkeys(text))
114
115
116
117 def caesar_encipher_letter(letter, shift):
118 """Encipher a letter, given a shift amount
119
120 >>> caesar_encipher_letter('a', 1)
121 'b'
122 >>> caesar_encipher_letter('a', 2)
123 'c'
124 >>> caesar_encipher_letter('b', 2)
125 'd'
126 >>> caesar_encipher_letter('x', 2)
127 'z'
128 >>> caesar_encipher_letter('y', 2)
129 'a'
130 >>> caesar_encipher_letter('z', 2)
131 'b'
132 >>> caesar_encipher_letter('z', -1)
133 'y'
134 >>> caesar_encipher_letter('a', -1)
135 'z'
136 """
137 if letter in string.ascii_letters:
138 if letter in string.ascii_uppercase:
139 alphabet_start = ord('A')
140 else:
141 alphabet_start = ord('a')
142 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
143 else:
144 return letter
145
146 def caesar_decipher_letter(letter, shift):
147 """Decipher a letter, given a shift amount
148
149 >>> caesar_decipher_letter('b', 1)
150 'a'
151 >>> caesar_decipher_letter('b', 2)
152 'z'
153 """
154 return caesar_encipher_letter(letter, -shift)
155
156 def caesar_encipher(message, shift):
157 """Encipher a message with the Caesar cipher of given shift
158
159 >>> caesar_encipher('abc', 1)
160 'bcd'
161 >>> caesar_encipher('abc', 2)
162 'cde'
163 >>> caesar_encipher('abcxyz', 2)
164 'cdezab'
165 >>> caesar_encipher('ab cx yz', 2)
166 'cd ez ab'
167 """
168 enciphered = [caesar_encipher_letter(l, shift) for l in message]
169 return ''.join(enciphered)
170
171 def caesar_decipher(message, shift):
172 """Encipher a message with the Caesar cipher of given shift
173
174 >>> caesar_decipher('bcd', 1)
175 'abc'
176 >>> caesar_decipher('cde', 2)
177 'abc'
178 >>> caesar_decipher('cd ez ab', 2)
179 'ab cx yz'
180 """
181 return caesar_encipher(message, -shift)
182
183 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
184 """Encipher a letter, given a multiplier and adder
185
186 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
187 'HKNQTWZCFILORUXADGJMPSVYBE'
188 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
189 'FILORUXADGJMPSVYBEHKNQTWZC'
190 """
191 if letter in string.ascii_letters:
192 if letter in string.ascii_uppercase:
193 alphabet_start = ord('A')
194 else:
195 alphabet_start = ord('a')
196 letter_number = ord(letter) - alphabet_start
197 if one_based: letter_number += 1
198 cipher_number = (letter_number * multiplier + adder) % 26
199 if one_based: cipher_number -= 1
200 return chr(cipher_number % 26 + alphabet_start)
201 else:
202 return letter
203
204 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
205 """Encipher a letter, given a multiplier and adder
206
207 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
208 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
209 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
210 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
211 """
212 if letter in string.ascii_letters:
213 if letter in string.ascii_uppercase:
214 alphabet_start = ord('A')
215 else:
216 alphabet_start = ord('a')
217 cipher_number = ord(letter) - alphabet_start
218 if one_based: cipher_number += 1
219 plaintext_number = modular_division_table[multiplier][(cipher_number - adder) % 26]
220 if one_based: plaintext_number -= 1
221 return chr(plaintext_number % 26 + alphabet_start)
222 else:
223 return letter
224
225 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
226 """Encipher a message
227
228 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
229 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
230 """
231 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
232 return ''.join(enciphered)
233
234 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
235 """Decipher a message
236
237 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
238 'hours passed during which jerico tried every trick he could think of'
239 """
240 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
241 return ''.join(enciphered)
242
243
244 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
245 """Find the cipher alphabet given a keyword.
246 wrap_alphabet controls how the rest of the alphabet is added
247 after the keyword.
248 0 : from 'a'
249 1 : from the last letter in the sanitised keyword
250 2 : from the largest letter in the sanitised keyword
251
252 >>> keyword_cipher_alphabet_of('bayes')
253 'bayescdfghijklmnopqrtuvwxz'
254 >>> keyword_cipher_alphabet_of('bayes', 0)
255 'bayescdfghijklmnopqrtuvwxz'
256 >>> keyword_cipher_alphabet_of('bayes', 1)
257 'bayestuvwxzcdfghijklmnopqr'
258 >>> keyword_cipher_alphabet_of('bayes', 2)
259 'bayeszcdfghijklmnopqrtuvwx'
260 """
261 if wrap_alphabet == 0:
262 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
263 else:
264 if wrap_alphabet == 1:
265 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
266 else:
267 last_keyword_letter = sorted(sanitise(keyword))[-1]
268 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
269 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
270 string.ascii_lowercase[last_keyword_position:] +
271 string.ascii_lowercase))
272 return cipher_alphabet
273
274
275 def keyword_encipher(message, keyword, wrap_alphabet=0):
276 """Enciphers a message with a keyword substitution cipher.
277 wrap_alphabet controls how the rest of the alphabet is added
278 after the keyword.
279 0 : from 'a'
280 1 : from the last letter in the sanitised keyword
281 2 : from the largest letter in the sanitised keyword
282
283 >>> keyword_encipher('test message', 'bayes')
284 'rsqr ksqqbds'
285 >>> keyword_encipher('test message', 'bayes', 0)
286 'rsqr ksqqbds'
287 >>> keyword_encipher('test message', 'bayes', 1)
288 'lskl dskkbus'
289 >>> keyword_encipher('test message', 'bayes', 2)
290 'qspq jsppbcs'
291 """
292 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
293 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
294 return message.lower().translate(cipher_translation)
295
296 def keyword_decipher(message, keyword, wrap_alphabet=0):
297 """Deciphers a message with a keyword substitution cipher.
298 wrap_alphabet controls how the rest of the alphabet is added
299 after the keyword.
300 0 : from 'a'
301 1 : from the last letter in the sanitised keyword
302 2 : from the largest letter in the sanitised keyword
303
304 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
305 'test message'
306 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
307 'test message'
308 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
309 'test message'
310 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
311 'test message'
312 """
313 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
314 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
315 return message.lower().translate(cipher_translation)
316
317 def scytale_encipher(message, rows):
318 """Enciphers using the scytale transposition cipher.
319 Message is padded with spaces to allow all rows to be the same length.
320
321 >>> scytale_encipher('thequickbrownfox', 3)
322 'tcnhkfeboqrxuo iw '
323 >>> scytale_encipher('thequickbrownfox', 4)
324 'tubnhirfecooqkwx'
325 >>> scytale_encipher('thequickbrownfox', 5)
326 'tubn hirf ecoo qkwx '
327 >>> scytale_encipher('thequickbrownfox', 6)
328 'tqcrnxhukof eibwo '
329 >>> scytale_encipher('thequickbrownfox', 7)
330 'tqcrnx hukof eibwo '
331 """
332 if len(message) % rows != 0:
333 message += ' '*(rows - len(message) % rows)
334 row_length = round(len(message) / rows)
335 slices = [message[i:i+row_length] for i in range(0, len(message), row_length)]
336 return ''.join([''.join(r) for r in zip_longest(*slices, fillvalue='')])
337
338 def scytale_decipher(message, rows):
339 """Deciphers using the scytale transposition cipher.
340 Assumes the message is padded so that all rows are the same length.
341
342 >>> scytale_decipher('tcnhkfeboqrxuo iw ', 3)
343 'thequickbrownfox '
344 >>> scytale_decipher('tubnhirfecooqkwx', 4)
345 'thequickbrownfox'
346 >>> scytale_decipher('tubn hirf ecoo qkwx ', 5)
347 'thequickbrownfox '
348 >>> scytale_decipher('tqcrnxhukof eibwo ', 6)
349 'thequickbrownfox '
350 >>> scytale_decipher('tqcrnx hukof eibwo ', 7)
351 'thequickbrownfox '
352 """
353 cols = round(len(message) / rows)
354 columns = [message[i:i+rows] for i in range(0, cols * rows, rows)]
355 return ''.join([''.join(c) for c in zip_longest(*columns, fillvalue='')])
356
357
358 def caesar_break(message, metric=norms.euclidean_distance, target_counts=normalised_english_counts, message_frequency_scaling=norms.normalise):
359 """Breaks a Caesar cipher using frequency analysis
360
361 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
362 (4, 0.31863952890183...)
363 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
364 (19, 0.42152901235832...)
365 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
366 (13, 0.316029208075451...)
367 """
368 sanitised_message = sanitise(message)
369 best_shift = 0
370 best_fit = float("inf")
371 for shift in range(26):
372 plaintext = caesar_decipher(sanitised_message, shift)
373 counts = message_frequency_scaling(letter_frequencies(plaintext))
374 fit = metric(target_counts, counts)
375 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
376 if fit < best_fit:
377 best_fit = fit
378 best_shift = shift
379 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
380 return best_shift, best_fit
381
382 def affine_break(message, metric=norms.euclidean_distance, target_counts=normalised_english_counts, message_frequency_scaling=norms.normalise):
383 """Breaks an affine cipher using frequency analysis
384
385 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
386 ((15, 22, True), 0.23570361818655...)
387 """
388 sanitised_message = sanitise(message)
389 best_multiplier = 0
390 best_adder = 0
391 best_one_based = True
392 best_fit = float("inf")
393 for one_based in [True, False]:
394 for multiplier in range(1, 26, 2):
395 for adder in range(26):
396 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
397 counts = message_frequency_scaling(letter_frequencies(plaintext))
398 fit = metric(target_counts, counts)
399 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
400 if fit < best_fit:
401 best_fit = fit
402 best_multiplier = multiplier
403 best_adder = adder
404 best_one_based = one_based
405 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
406 return (best_multiplier, best_adder, best_one_based), best_fit
407
408
409 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_counts=normalised_english_counts, message_frequency_scaling=norms.normalise):
410 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
411
412 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
413 (('elephant', 1), 0.41643991598441...)
414 """
415 best_keyword = ''
416 best_wrap_alphabet = True
417 best_fit = float("inf")
418 for wrap_alphabet in range(3):
419 for keyword in wordlist:
420 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
421 counts = message_frequency_scaling(letter_frequencies(plaintext))
422 fit = metric(target_counts, counts)
423 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
424 if fit < best_fit:
425 best_fit = fit
426 best_keyword = keyword
427 best_wrap_alphabet = wrap_alphabet
428 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
429 return (best_keyword, best_wrap_alphabet), best_fit
430
431 def scytale_break(message, metric=norms.euclidean_distance, target_counts=normalised_english_bigram_counts, message_frequency_scaling=norms.normalise):
432 """Breaks a Scytale cipher
433
434 >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoeredcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriiotoaek') # doctest: +ELLIPSIS
435 (6, 0.83453041115025...)
436 """
437 best_key = 0
438 best_fit = float("inf")
439 for key in range(1, 20):
440 if len(message) % key == 0:
441 plaintext = scytale_decipher(message, key)
442 counts = message_frequency_scaling(frequencies(ngrams(sanitise(plaintext), 2)))
443 fit = metric(target_counts, counts)
444 logger.debug('Scytale break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(key, fit, sanitise(plaintext)[:50]))
445 if fit < best_fit:
446 best_fit = fit
447 best_key = key
448 logger.info('Scytale break best fit with key {0} gives fit of {1} and decrypt starting: {2}'.format(best_key, best_fit, sanitise(scytale_decipher(message, best_key))[:50]))
449 return best_key, best_fit
450
451
452 if __name__ == "__main__":
453 import doctest
454 doctest.testmod()