0e89a9c01ebdbe60e9e29b16c0822093caca983c
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5 from itertools import zip_longest
6 from segment import segment
7
8 # To time a run:
9 #
10 # import timeit
11 # c5a = open('2012/5a.ciphertext', 'r').read()
12 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
13
14
15 logger = logging.getLogger(__name__)
16 logger.addHandler(logging.FileHandler('cipher.log'))
17 logger.setLevel(logging.WARNING)
18 #logger.setLevel(logging.INFO)
19
20 english_counts = collections.defaultdict(int)
21 with open('count_1l.txt', 'r') as f:
22 for line in f:
23 (letter, count) = line.split("\t")
24 english_counts[letter] = int(count)
25 normalised_english_counts = norms.normalise(english_counts)
26
27 with open('words.txt', 'r') as f:
28 keywords = [line.rstrip() for line in f]
29
30 modular_division_table = [[0]*26 for x in range(26)]
31 for a in range(26):
32 for b in range(26):
33 c = (a * b) % 26
34 modular_division_table[b][c] = a
35
36 modular_division_table_one_based = [[0]*27 for x in range(27)]
37 for a in range(27):
38 for b in range(27):
39 c = ((a * b)-1) % 26 + 1
40 modular_division_table_one_based[b][c] = a
41
42
43
44 def sanitise(text):
45 """Remove all non-alphabetic characters and convert the text to lowercase
46
47 >>> sanitise('The Quick')
48 'thequick'
49 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
50 'thequickbrownfoxjumpedoverthelazydog'
51 """
52 sanitised = [c.lower() for c in text if c in string.ascii_letters]
53 return ''.join(sanitised)
54
55 def ngrams(text, n):
56 """Returns all n-grams of a text
57
58 >>> ngrams(sanitise('the quick brown fox'), 2)
59 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
60 >>> ngrams(sanitise('the quick brown fox'), 4)
61 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
62 """
63 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
64
65 def every_nth(text, n):
66 """Returns n strings, each of which consists of every nth character,
67 starting with the 0th, 1st, 2nd, ... (n-1)th character
68
69 >>> every_nth(string.ascii_lowercase, 5)
70 ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
71 >>> every_nth(string.ascii_lowercase, 1)
72 ['abcdefghijklmnopqrstuvwxyz']
73 >>> every_nth(string.ascii_lowercase, 26)
74 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
75 """
76 split_text = [text[i:i+n] for i in range(0, len(text), n)]
77 return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
78
79 def combine_every_nth(split_text):
80 """Reforms a text split into every_nth strings
81
82 >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
83 'abcdefghijklmnopqrstuvwxyz'
84 >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
85 'abcdefghijklmnopqrstuvwxyz'
86 >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
87 'abcdefghijklmnopqrstuvwxyz'
88 """
89 return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
90
91
92 def letter_frequencies(text):
93 """Count the number of occurrences of each character in text
94
95 >>> sorted(letter_frequencies('abcdefabc').items())
96 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
97 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
98 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
99 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
100 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
101 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
102 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
103 """
104 counts = collections.defaultdict(int)
105 for c in text:
106 counts[c] += 1
107 return counts
108
109 def deduplicate(text):
110 return list(collections.OrderedDict.fromkeys(text))
111
112
113
114 def caesar_encipher_letter(letter, shift):
115 """Encipher a letter, given a shift amount
116
117 >>> caesar_encipher_letter('a', 1)
118 'b'
119 >>> caesar_encipher_letter('a', 2)
120 'c'
121 >>> caesar_encipher_letter('b', 2)
122 'd'
123 >>> caesar_encipher_letter('x', 2)
124 'z'
125 >>> caesar_encipher_letter('y', 2)
126 'a'
127 >>> caesar_encipher_letter('z', 2)
128 'b'
129 >>> caesar_encipher_letter('z', -1)
130 'y'
131 >>> caesar_encipher_letter('a', -1)
132 'z'
133 """
134 if letter in string.ascii_letters:
135 if letter in string.ascii_uppercase:
136 alphabet_start = ord('A')
137 else:
138 alphabet_start = ord('a')
139 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
140 else:
141 return letter
142
143 def caesar_decipher_letter(letter, shift):
144 """Decipher a letter, given a shift amount
145
146 >>> caesar_decipher_letter('b', 1)
147 'a'
148 >>> caesar_decipher_letter('b', 2)
149 'z'
150 """
151 return caesar_encipher_letter(letter, -shift)
152
153 def caesar_encipher(message, shift):
154 """Encipher a message with the Caesar cipher of given shift
155
156 >>> caesar_encipher('abc', 1)
157 'bcd'
158 >>> caesar_encipher('abc', 2)
159 'cde'
160 >>> caesar_encipher('abcxyz', 2)
161 'cdezab'
162 >>> caesar_encipher('ab cx yz', 2)
163 'cd ez ab'
164 """
165 enciphered = [caesar_encipher_letter(l, shift) for l in message]
166 return ''.join(enciphered)
167
168 def caesar_decipher(message, shift):
169 """Encipher a message with the Caesar cipher of given shift
170
171 >>> caesar_decipher('bcd', 1)
172 'abc'
173 >>> caesar_decipher('cde', 2)
174 'abc'
175 >>> caesar_decipher('cd ez ab', 2)
176 'ab cx yz'
177 """
178 return caesar_encipher(message, -shift)
179
180 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
181 """Encipher a letter, given a multiplier and adder
182
183 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
184 'HKNQTWZCFILORUXADGJMPSVYBE'
185 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
186 'FILORUXADGJMPSVYBEHKNQTWZC'
187 """
188 if letter in string.ascii_letters:
189 if letter in string.ascii_uppercase:
190 alphabet_start = ord('A')
191 else:
192 alphabet_start = ord('a')
193 letter_number = ord(letter) - alphabet_start
194 if one_based:
195 letter_number += 1
196 raw_cipher_number = (letter_number * multiplier + adder)
197 cipher_number = 0
198 if one_based:
199 cipher_number = (raw_cipher_number - 1) % 26
200 else:
201 cipher_number = raw_cipher_number % 26
202 return chr(cipher_number + alphabet_start)
203 else:
204 return letter
205
206 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
207 """Encipher a letter, given a multiplier and adder
208
209 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
210 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
211 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
212 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
213 """
214 if letter in string.ascii_letters:
215 if letter in string.ascii_uppercase:
216 alphabet_start = ord('A')
217 else:
218 alphabet_start = ord('a')
219 cipher_number = ord(letter) - alphabet_start
220 if one_based:
221 cipher_number += 1
222 plaintext_number = 0
223 if one_based:
224 plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
225 else:
226 #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
227 plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
228 return chr(plaintext_number + alphabet_start)
229 else:
230 return letter
231
232 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
233 """Encipher a message
234
235 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
236 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
237 """
238 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
239 return ''.join(enciphered)
240
241 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
242 """Decipher a message
243
244 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
245 'hours passed during which jerico tried every trick he could think of'
246 """
247 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
248 return ''.join(enciphered)
249
250
251 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
252 """Find the cipher alphabet given a keyword.
253 wrap_alphabet controls how the rest of the alphabet is added
254 after the keyword.
255 0 : from 'a'
256 1 : from the last letter in the sanitised keyword
257 2 : from the largest letter in the sanitised keyword
258
259 >>> keyword_cipher_alphabet_of('bayes')
260 'bayescdfghijklmnopqrtuvwxz'
261 >>> keyword_cipher_alphabet_of('bayes', 0)
262 'bayescdfghijklmnopqrtuvwxz'
263 >>> keyword_cipher_alphabet_of('bayes', 1)
264 'bayestuvwxzcdfghijklmnopqr'
265 >>> keyword_cipher_alphabet_of('bayes', 2)
266 'bayeszcdfghijklmnopqrtuvwx'
267 """
268 if wrap_alphabet == 0:
269 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
270 else:
271 if wrap_alphabet == 1:
272 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
273 else:
274 last_keyword_letter = sorted(sanitise(keyword))[-1]
275 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
276 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
277 string.ascii_lowercase[last_keyword_position:] +
278 string.ascii_lowercase))
279 return cipher_alphabet
280
281
282 def keyword_encipher(message, keyword, wrap_alphabet=0):
283 """Enciphers a message with a keyword substitution cipher.
284 wrap_alphabet controls how the rest of the alphabet is added
285 after the keyword.
286 0 : from 'a'
287 1 : from the last letter in the sanitised keyword
288 2 : from the largest letter in the sanitised keyword
289
290 >>> keyword_encipher('test message', 'bayes')
291 'rsqr ksqqbds'
292 >>> keyword_encipher('test message', 'bayes', 0)
293 'rsqr ksqqbds'
294 >>> keyword_encipher('test message', 'bayes', 1)
295 'lskl dskkbus'
296 >>> keyword_encipher('test message', 'bayes', 2)
297 'qspq jsppbcs'
298 """
299 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
300 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
301 return message.lower().translate(cipher_translation)
302
303 def keyword_decipher(message, keyword, wrap_alphabet=0):
304 """Deciphers a message with a keyword substitution cipher.
305 wrap_alphabet controls how the rest of the alphabet is added
306 after the keyword.
307 0 : from 'a'
308 1 : from the last letter in the sanitised keyword
309 2 : from the largest letter in the sanitised keyword
310
311 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
312 'test message'
313 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
314 'test message'
315 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
316 'test message'
317 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
318 'test message'
319 """
320 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
321 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
322 return message.lower().translate(cipher_translation)
323
324
325 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
326 """Breaks a Caesar cipher using frequency analysis
327
328 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
329 (4, 0.31863952890183...)
330 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
331 (19, 0.42152901235832...)
332 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
333 (13, 0.316029208075451...)
334 """
335 sanitised_message = sanitise(message)
336 best_shift = 0
337 best_fit = float("inf")
338 for shift in range(26):
339 plaintext = caesar_decipher(sanitised_message, shift)
340 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
341 fit = metric(target_frequencies, frequencies)
342 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
343 if fit < best_fit:
344 best_fit = fit
345 best_shift = shift
346 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
347 return best_shift, best_fit
348
349 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
350 """Breaks an affine cipher using frequency analysis
351
352 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
353 ((15, 22, True), 0.23570361818655...)
354 """
355 sanitised_message = sanitise(message)
356 best_multiplier = 0
357 best_adder = 0
358 best_one_based = True
359 best_fit = float("inf")
360 for one_based in [True, False]:
361 for multiplier in range(1, 26, 2):
362 for adder in range(26):
363 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
364 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
365 fit = metric(target_frequencies, frequencies)
366 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
367 if fit < best_fit:
368 best_fit = fit
369 best_multiplier = multiplier
370 best_adder = adder
371 best_one_based = one_based
372 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
373 return (best_multiplier, best_adder, best_one_based), best_fit
374
375
376 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
377 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
378
379 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
380 (('elephant', 1), 0.41643991598441...)
381 """
382 best_keyword = ''
383 best_wrap_alphabet = True
384 best_fit = float("inf")
385 for wrap_alphabet in range(3):
386 for keyword in wordlist:
387 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
388 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
389 fit = metric(target_frequencies, frequencies)
390 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
391 if fit < best_fit:
392 best_fit = fit
393 best_keyword = keyword
394 best_wrap_alphabet = wrap_alphabet
395 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
396 return (best_keyword, best_wrap_alphabet), best_fit
397
398
399 if __name__ == "__main__":
400 import doctest
401 doctest.testmod()