Tidied affine cipher and decipher
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5 from itertools import zip_longest
6 from segment import segment
7
8 # To time a run:
9 #
10 # import timeit
11 # c5a = open('2012/5a.ciphertext', 'r').read()
12 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
13
14
15 logger = logging.getLogger(__name__)
16 logger.addHandler(logging.FileHandler('cipher.log'))
17 logger.setLevel(logging.WARNING)
18 #logger.setLevel(logging.INFO)
19
20 english_counts = collections.defaultdict(int)
21 with open('count_1l.txt', 'r') as f:
22 for line in f:
23 (letter, count) = line.split("\t")
24 english_counts[letter] = int(count)
25 normalised_english_counts = norms.normalise(english_counts)
26
27 with open('words.txt', 'r') as f:
28 keywords = [line.rstrip() for line in f]
29
30 modular_division_table = [[0]*26 for x in range(26)]
31 for a in range(26):
32 for b in range(26):
33 c = (a * b) % 26
34 modular_division_table[b][c] = a
35
36 modular_division_table_one_based = [[0]*27 for x in range(27)]
37 for a in range(27):
38 for b in range(27):
39 c = ((a * b)-1) % 26 + 1
40 modular_division_table_one_based[b][c] = a
41
42
43
44 def sanitise(text):
45 """Remove all non-alphabetic characters and convert the text to lowercase
46
47 >>> sanitise('The Quick')
48 'thequick'
49 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
50 'thequickbrownfoxjumpedoverthelazydog'
51 """
52 sanitised = [c.lower() for c in text if c in string.ascii_letters]
53 return ''.join(sanitised)
54
55 def ngrams(text, n):
56 """Returns all n-grams of a text
57
58 >>> ngrams(sanitise('the quick brown fox'), 2)
59 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
60 >>> ngrams(sanitise('the quick brown fox'), 4)
61 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
62 """
63 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
64
65 def every_nth(text, n):
66 """Returns n strings, each of which consists of every nth character,
67 starting with the 0th, 1st, 2nd, ... (n-1)th character
68
69 >>> every_nth(string.ascii_lowercase, 5)
70 ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
71 >>> every_nth(string.ascii_lowercase, 1)
72 ['abcdefghijklmnopqrstuvwxyz']
73 >>> every_nth(string.ascii_lowercase, 26)
74 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
75 """
76 split_text = [text[i:i+n] for i in range(0, len(text), n)]
77 return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
78
79 def combine_every_nth(split_text):
80 """Reforms a text split into every_nth strings
81
82 >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
83 'abcdefghijklmnopqrstuvwxyz'
84 >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
85 'abcdefghijklmnopqrstuvwxyz'
86 >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
87 'abcdefghijklmnopqrstuvwxyz'
88 """
89 return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
90
91
92 def letter_frequencies(text):
93 """Count the number of occurrences of each character in text
94
95 >>> sorted(letter_frequencies('abcdefabc').items())
96 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
97 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
98 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
99 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
100 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
101 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
102 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
103 """
104 counts = collections.defaultdict(int)
105 for c in text:
106 counts[c] += 1
107 return counts
108
109 def deduplicate(text):
110 return list(collections.OrderedDict.fromkeys(text))
111
112
113
114 def caesar_encipher_letter(letter, shift):
115 """Encipher a letter, given a shift amount
116
117 >>> caesar_encipher_letter('a', 1)
118 'b'
119 >>> caesar_encipher_letter('a', 2)
120 'c'
121 >>> caesar_encipher_letter('b', 2)
122 'd'
123 >>> caesar_encipher_letter('x', 2)
124 'z'
125 >>> caesar_encipher_letter('y', 2)
126 'a'
127 >>> caesar_encipher_letter('z', 2)
128 'b'
129 >>> caesar_encipher_letter('z', -1)
130 'y'
131 >>> caesar_encipher_letter('a', -1)
132 'z'
133 """
134 if letter in string.ascii_letters:
135 if letter in string.ascii_uppercase:
136 alphabet_start = ord('A')
137 else:
138 alphabet_start = ord('a')
139 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
140 else:
141 return letter
142
143 def caesar_decipher_letter(letter, shift):
144 """Decipher a letter, given a shift amount
145
146 >>> caesar_decipher_letter('b', 1)
147 'a'
148 >>> caesar_decipher_letter('b', 2)
149 'z'
150 """
151 return caesar_encipher_letter(letter, -shift)
152
153 def caesar_encipher(message, shift):
154 """Encipher a message with the Caesar cipher of given shift
155
156 >>> caesar_encipher('abc', 1)
157 'bcd'
158 >>> caesar_encipher('abc', 2)
159 'cde'
160 >>> caesar_encipher('abcxyz', 2)
161 'cdezab'
162 >>> caesar_encipher('ab cx yz', 2)
163 'cd ez ab'
164 """
165 enciphered = [caesar_encipher_letter(l, shift) for l in message]
166 return ''.join(enciphered)
167
168 def caesar_decipher(message, shift):
169 """Encipher a message with the Caesar cipher of given shift
170
171 >>> caesar_decipher('bcd', 1)
172 'abc'
173 >>> caesar_decipher('cde', 2)
174 'abc'
175 >>> caesar_decipher('cd ez ab', 2)
176 'ab cx yz'
177 """
178 return caesar_encipher(message, -shift)
179
180 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
181 """Encipher a letter, given a multiplier and adder
182
183 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
184 'HKNQTWZCFILORUXADGJMPSVYBE'
185 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
186 'FILORUXADGJMPSVYBEHKNQTWZC'
187 """
188 if letter in string.ascii_letters:
189 if letter in string.ascii_uppercase:
190 alphabet_start = ord('A')
191 else:
192 alphabet_start = ord('a')
193 letter_number = ord(letter) - alphabet_start
194 if one_based:
195 letter_number += 1
196 raw_cipher_number = (letter_number * multiplier + adder)
197 if one_based:
198 cipher_number = (raw_cipher_number - 1) % 26
199 else:
200 cipher_number = raw_cipher_number % 26
201 return chr(cipher_number + alphabet_start)
202 else:
203 return letter
204
205 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
206 """Encipher a letter, given a multiplier and adder
207
208 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
209 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
210 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
211 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
212 """
213 if letter in string.ascii_letters:
214 if letter in string.ascii_uppercase:
215 alphabet_start = ord('A')
216 else:
217 alphabet_start = ord('a')
218 cipher_number = ord(letter) - alphabet_start
219 if one_based:
220 plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number + 1 - adder + 26) % 26] - 1) % 26
221 else:
222 plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
223 return chr(plaintext_number + alphabet_start)
224 else:
225 return letter
226
227 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
228 """Encipher a message
229
230 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
231 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
232 """
233 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
234 return ''.join(enciphered)
235
236 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
237 """Decipher a message
238
239 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
240 'hours passed during which jerico tried every trick he could think of'
241 """
242 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
243 return ''.join(enciphered)
244
245
246 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
247 """Find the cipher alphabet given a keyword.
248 wrap_alphabet controls how the rest of the alphabet is added
249 after the keyword.
250 0 : from 'a'
251 1 : from the last letter in the sanitised keyword
252 2 : from the largest letter in the sanitised keyword
253
254 >>> keyword_cipher_alphabet_of('bayes')
255 'bayescdfghijklmnopqrtuvwxz'
256 >>> keyword_cipher_alphabet_of('bayes', 0)
257 'bayescdfghijklmnopqrtuvwxz'
258 >>> keyword_cipher_alphabet_of('bayes', 1)
259 'bayestuvwxzcdfghijklmnopqr'
260 >>> keyword_cipher_alphabet_of('bayes', 2)
261 'bayeszcdfghijklmnopqrtuvwx'
262 """
263 if wrap_alphabet == 0:
264 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
265 else:
266 if wrap_alphabet == 1:
267 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
268 else:
269 last_keyword_letter = sorted(sanitise(keyword))[-1]
270 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
271 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
272 string.ascii_lowercase[last_keyword_position:] +
273 string.ascii_lowercase))
274 return cipher_alphabet
275
276
277 def keyword_encipher(message, keyword, wrap_alphabet=0):
278 """Enciphers a message with a keyword substitution cipher.
279 wrap_alphabet controls how the rest of the alphabet is added
280 after the keyword.
281 0 : from 'a'
282 1 : from the last letter in the sanitised keyword
283 2 : from the largest letter in the sanitised keyword
284
285 >>> keyword_encipher('test message', 'bayes')
286 'rsqr ksqqbds'
287 >>> keyword_encipher('test message', 'bayes', 0)
288 'rsqr ksqqbds'
289 >>> keyword_encipher('test message', 'bayes', 1)
290 'lskl dskkbus'
291 >>> keyword_encipher('test message', 'bayes', 2)
292 'qspq jsppbcs'
293 """
294 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
295 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
296 return message.lower().translate(cipher_translation)
297
298 def keyword_decipher(message, keyword, wrap_alphabet=0):
299 """Deciphers a message with a keyword substitution cipher.
300 wrap_alphabet controls how the rest of the alphabet is added
301 after the keyword.
302 0 : from 'a'
303 1 : from the last letter in the sanitised keyword
304 2 : from the largest letter in the sanitised keyword
305
306 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
307 'test message'
308 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
309 'test message'
310 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
311 'test message'
312 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
313 'test message'
314 """
315 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
316 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
317 return message.lower().translate(cipher_translation)
318
319
320 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
321 """Breaks a Caesar cipher using frequency analysis
322
323 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
324 (4, 0.31863952890183...)
325 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
326 (19, 0.42152901235832...)
327 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
328 (13, 0.316029208075451...)
329 """
330 sanitised_message = sanitise(message)
331 best_shift = 0
332 best_fit = float("inf")
333 for shift in range(26):
334 plaintext = caesar_decipher(sanitised_message, shift)
335 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
336 fit = metric(target_frequencies, frequencies)
337 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
338 if fit < best_fit:
339 best_fit = fit
340 best_shift = shift
341 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
342 return best_shift, best_fit
343
344 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
345 """Breaks an affine cipher using frequency analysis
346
347 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
348 ((15, 22, True), 0.23570361818655...)
349 """
350 sanitised_message = sanitise(message)
351 best_multiplier = 0
352 best_adder = 0
353 best_one_based = True
354 best_fit = float("inf")
355 for one_based in [True, False]:
356 for multiplier in range(1, 26, 2):
357 for adder in range(26):
358 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
359 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
360 fit = metric(target_frequencies, frequencies)
361 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
362 if fit < best_fit:
363 best_fit = fit
364 best_multiplier = multiplier
365 best_adder = adder
366 best_one_based = one_based
367 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
368 return (best_multiplier, best_adder, best_one_based), best_fit
369
370
371 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
372 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
373
374 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
375 (('elephant', 1), 0.41643991598441...)
376 """
377 best_keyword = ''
378 best_wrap_alphabet = True
379 best_fit = float("inf")
380 for wrap_alphabet in range(3):
381 for keyword in wordlist:
382 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
383 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
384 fit = metric(target_frequencies, frequencies)
385 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
386 if fit < best_fit:
387 best_fit = fit
388 best_keyword = keyword
389 best_wrap_alphabet = wrap_alphabet
390 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
391 return (best_keyword, best_wrap_alphabet), best_fit
392
393
394 if __name__ == "__main__":
395 import doctest
396 doctest.testmod()