Scytale fixed by padding message during enciphering and assuming all rows are the...
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5 import math
6 from itertools import zip_longest
7 from segment import segment
8
9 # To time a run:
10 #
11 # import timeit
12 # c5a = open('2012/5a.ciphertext', 'r').read()
13 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
14
15
16 logger = logging.getLogger(__name__)
17 logger.addHandler(logging.FileHandler('cipher.log'))
18 logger.setLevel(logging.WARNING)
19 #logger.setLevel(logging.INFO)
20
21 english_counts = collections.defaultdict(int)
22 with open('count_1l.txt', 'r') as f:
23 for line in f:
24 (letter, count) = line.split("\t")
25 english_counts[letter] = int(count)
26 normalised_english_counts = norms.normalise(english_counts)
27
28 with open('words.txt', 'r') as f:
29 keywords = [line.rstrip() for line in f]
30
31 modular_division_table = [[0]*26 for x in range(26)]
32 for a in range(26):
33 for b in range(26):
34 c = (a * b) % 26
35 modular_division_table[b][c] = a
36
37
38 def sanitise(text):
39 """Remove all non-alphabetic characters and convert the text to lowercase
40
41 >>> sanitise('The Quick')
42 'thequick'
43 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
44 'thequickbrownfoxjumpedoverthelazydog'
45 """
46 sanitised = [c.lower() for c in text if c in string.ascii_letters]
47 return ''.join(sanitised)
48
49 def ngrams(text, n):
50 """Returns all n-grams of a text
51
52 >>> ngrams(sanitise('the quick brown fox'), 2)
53 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
54 >>> ngrams(sanitise('the quick brown fox'), 4)
55 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
56 """
57 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
58
59 def every_nth(text, n):
60 """Returns n strings, each of which consists of every nth character,
61 starting with the 0th, 1st, 2nd, ... (n-1)th character
62
63 >>> every_nth(string.ascii_lowercase, 5)
64 ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
65 >>> every_nth(string.ascii_lowercase, 1)
66 ['abcdefghijklmnopqrstuvwxyz']
67 >>> every_nth(string.ascii_lowercase, 26)
68 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
69 """
70 split_text = [text[i:i+n] for i in range(0, len(text), n)]
71 return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
72
73 def combine_every_nth(split_text):
74 """Reforms a text split into every_nth strings
75
76 >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
77 'abcdefghijklmnopqrstuvwxyz'
78 >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
79 'abcdefghijklmnopqrstuvwxyz'
80 >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
81 'abcdefghijklmnopqrstuvwxyz'
82 """
83 return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
84
85
86 def letter_frequencies(text):
87 """Count the number of occurrences of each character in text
88
89 >>> sorted(letter_frequencies('abcdefabc').items())
90 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
91 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
92 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
93 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
94 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
95 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
96 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
97 """
98 counts = collections.defaultdict(int)
99 for c in text:
100 counts[c] += 1
101 return counts
102
103 def deduplicate(text):
104 return list(collections.OrderedDict.fromkeys(text))
105
106
107
108 def caesar_encipher_letter(letter, shift):
109 """Encipher a letter, given a shift amount
110
111 >>> caesar_encipher_letter('a', 1)
112 'b'
113 >>> caesar_encipher_letter('a', 2)
114 'c'
115 >>> caesar_encipher_letter('b', 2)
116 'd'
117 >>> caesar_encipher_letter('x', 2)
118 'z'
119 >>> caesar_encipher_letter('y', 2)
120 'a'
121 >>> caesar_encipher_letter('z', 2)
122 'b'
123 >>> caesar_encipher_letter('z', -1)
124 'y'
125 >>> caesar_encipher_letter('a', -1)
126 'z'
127 """
128 if letter in string.ascii_letters:
129 if letter in string.ascii_uppercase:
130 alphabet_start = ord('A')
131 else:
132 alphabet_start = ord('a')
133 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
134 else:
135 return letter
136
137 def caesar_decipher_letter(letter, shift):
138 """Decipher a letter, given a shift amount
139
140 >>> caesar_decipher_letter('b', 1)
141 'a'
142 >>> caesar_decipher_letter('b', 2)
143 'z'
144 """
145 return caesar_encipher_letter(letter, -shift)
146
147 def caesar_encipher(message, shift):
148 """Encipher a message with the Caesar cipher of given shift
149
150 >>> caesar_encipher('abc', 1)
151 'bcd'
152 >>> caesar_encipher('abc', 2)
153 'cde'
154 >>> caesar_encipher('abcxyz', 2)
155 'cdezab'
156 >>> caesar_encipher('ab cx yz', 2)
157 'cd ez ab'
158 """
159 enciphered = [caesar_encipher_letter(l, shift) for l in message]
160 return ''.join(enciphered)
161
162 def caesar_decipher(message, shift):
163 """Encipher a message with the Caesar cipher of given shift
164
165 >>> caesar_decipher('bcd', 1)
166 'abc'
167 >>> caesar_decipher('cde', 2)
168 'abc'
169 >>> caesar_decipher('cd ez ab', 2)
170 'ab cx yz'
171 """
172 return caesar_encipher(message, -shift)
173
174 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
175 """Encipher a letter, given a multiplier and adder
176
177 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
178 'HKNQTWZCFILORUXADGJMPSVYBE'
179 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
180 'FILORUXADGJMPSVYBEHKNQTWZC'
181 """
182 if letter in string.ascii_letters:
183 if letter in string.ascii_uppercase:
184 alphabet_start = ord('A')
185 else:
186 alphabet_start = ord('a')
187 letter_number = ord(letter) - alphabet_start
188 if one_based: letter_number += 1
189 cipher_number = (letter_number * multiplier + adder) % 26
190 if one_based: cipher_number -= 1
191 return chr(cipher_number % 26 + alphabet_start)
192 else:
193 return letter
194
195 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
196 """Encipher a letter, given a multiplier and adder
197
198 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
199 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
200 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
201 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
202 """
203 if letter in string.ascii_letters:
204 if letter in string.ascii_uppercase:
205 alphabet_start = ord('A')
206 else:
207 alphabet_start = ord('a')
208 cipher_number = ord(letter) - alphabet_start
209 if one_based: cipher_number += 1
210 plaintext_number = modular_division_table[multiplier][(cipher_number - adder) % 26]
211 if one_based: plaintext_number -= 1
212 return chr(plaintext_number % 26 + alphabet_start)
213 else:
214 return letter
215
216 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
217 """Encipher a message
218
219 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
220 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
221 """
222 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
223 return ''.join(enciphered)
224
225 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
226 """Decipher a message
227
228 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
229 'hours passed during which jerico tried every trick he could think of'
230 """
231 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
232 return ''.join(enciphered)
233
234
235 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
236 """Find the cipher alphabet given a keyword.
237 wrap_alphabet controls how the rest of the alphabet is added
238 after the keyword.
239 0 : from 'a'
240 1 : from the last letter in the sanitised keyword
241 2 : from the largest letter in the sanitised keyword
242
243 >>> keyword_cipher_alphabet_of('bayes')
244 'bayescdfghijklmnopqrtuvwxz'
245 >>> keyword_cipher_alphabet_of('bayes', 0)
246 'bayescdfghijklmnopqrtuvwxz'
247 >>> keyword_cipher_alphabet_of('bayes', 1)
248 'bayestuvwxzcdfghijklmnopqr'
249 >>> keyword_cipher_alphabet_of('bayes', 2)
250 'bayeszcdfghijklmnopqrtuvwx'
251 """
252 if wrap_alphabet == 0:
253 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
254 else:
255 if wrap_alphabet == 1:
256 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
257 else:
258 last_keyword_letter = sorted(sanitise(keyword))[-1]
259 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
260 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
261 string.ascii_lowercase[last_keyword_position:] +
262 string.ascii_lowercase))
263 return cipher_alphabet
264
265
266 def keyword_encipher(message, keyword, wrap_alphabet=0):
267 """Enciphers a message with a keyword substitution cipher.
268 wrap_alphabet controls how the rest of the alphabet is added
269 after the keyword.
270 0 : from 'a'
271 1 : from the last letter in the sanitised keyword
272 2 : from the largest letter in the sanitised keyword
273
274 >>> keyword_encipher('test message', 'bayes')
275 'rsqr ksqqbds'
276 >>> keyword_encipher('test message', 'bayes', 0)
277 'rsqr ksqqbds'
278 >>> keyword_encipher('test message', 'bayes', 1)
279 'lskl dskkbus'
280 >>> keyword_encipher('test message', 'bayes', 2)
281 'qspq jsppbcs'
282 """
283 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
284 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
285 return message.lower().translate(cipher_translation)
286
287 def keyword_decipher(message, keyword, wrap_alphabet=0):
288 """Deciphers a message with a keyword substitution cipher.
289 wrap_alphabet controls how the rest of the alphabet is added
290 after the keyword.
291 0 : from 'a'
292 1 : from the last letter in the sanitised keyword
293 2 : from the largest letter in the sanitised keyword
294
295 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
296 'test message'
297 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
298 'test message'
299 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
300 'test message'
301 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
302 'test message'
303 """
304 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
305 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
306 return message.lower().translate(cipher_translation)
307
308 def scytale_encipher(message, rows):
309 """Enciphers using the scytale transposition cipher.
310 Message is padded with spaces to allow all rows to be the same length.
311
312 >>> scytale_encipher('thequickbrownfox', 3)
313 'tcnhkfeboqrxuo iw '
314 >>> scytale_encipher('thequickbrownfox', 4)
315 'tubnhirfecooqkwx'
316 >>> scytale_encipher('thequickbrownfox', 5)
317 'tubn hirf ecoo qkwx '
318 >>> scytale_encipher('thequickbrownfox', 6)
319 'tqcrnxhukof eibwo '
320 >>> scytale_encipher('thequickbrownfox', 7)
321 'tqcrnx hukof eibwo '
322 """
323 if len(message) % rows != 0:
324 message += ' '*(rows - len(message) % rows)
325 row_length = round(len(message) / rows)
326 slices = [message[i:i+row_length] for i in range(0, len(message), row_length)]
327 return ''.join([''.join(r) for r in zip_longest(*slices, fillvalue='')])
328
329 def scytale_decipher(message, rows):
330 """Deciphers using the scytale transposition cipher.
331 Assumes the message is padded so that all rows are the same length.
332
333 >>> scytale_decipher('tcnhkfeboqrxuo iw ', 3)
334 'thequickbrownfox '
335 >>> scytale_decipher('tubnhirfecooqkwx', 4)
336 'thequickbrownfox'
337 >>> scytale_decipher('tubn hirf ecoo qkwx ', 5)
338 'thequickbrownfox '
339 >>> scytale_decipher('tqcrnxhukof eibwo ', 6)
340 'thequickbrownfox '
341 >>> scytale_decipher('tqcrnx hukof eibwo ', 7)
342 'thequickbrownfox '
343 """
344 cols = round(len(message) / rows)
345 columns = [message[i:i+rows] for i in range(0, cols * rows, rows)]
346 return ''.join([''.join(c) for c in zip_longest(*columns, fillvalue='')])
347
348
349 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
350 """Breaks a Caesar cipher using frequency analysis
351
352 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
353 (4, 0.31863952890183...)
354 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
355 (19, 0.42152901235832...)
356 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
357 (13, 0.316029208075451...)
358 """
359 sanitised_message = sanitise(message)
360 best_shift = 0
361 best_fit = float("inf")
362 for shift in range(26):
363 plaintext = caesar_decipher(sanitised_message, shift)
364 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
365 fit = metric(target_frequencies, frequencies)
366 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
367 if fit < best_fit:
368 best_fit = fit
369 best_shift = shift
370 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
371 return best_shift, best_fit
372
373 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
374 """Breaks an affine cipher using frequency analysis
375
376 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
377 ((15, 22, True), 0.23570361818655...)
378 """
379 sanitised_message = sanitise(message)
380 best_multiplier = 0
381 best_adder = 0
382 best_one_based = True
383 best_fit = float("inf")
384 for one_based in [True, False]:
385 for multiplier in range(1, 26, 2):
386 for adder in range(26):
387 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
388 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
389 fit = metric(target_frequencies, frequencies)
390 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
391 if fit < best_fit:
392 best_fit = fit
393 best_multiplier = multiplier
394 best_adder = adder
395 best_one_based = one_based
396 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
397 return (best_multiplier, best_adder, best_one_based), best_fit
398
399
400 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
401 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
402
403 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
404 (('elephant', 1), 0.41643991598441...)
405 """
406 best_keyword = ''
407 best_wrap_alphabet = True
408 best_fit = float("inf")
409 for wrap_alphabet in range(3):
410 for keyword in wordlist:
411 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
412 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
413 fit = metric(target_frequencies, frequencies)
414 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
415 if fit < best_fit:
416 best_fit = fit
417 best_keyword = keyword
418 best_wrap_alphabet = wrap_alphabet
419 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
420 return (best_keyword, best_wrap_alphabet), best_fit
421
422
423 if __name__ == "__main__":
424 import doctest
425 doctest.testmod()