Simplified affine ciphers
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5 from itertools import zip_longest
6 from segment import segment
7
8 # To time a run:
9 #
10 # import timeit
11 # c5a = open('2012/5a.ciphertext', 'r').read()
12 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
13
14
15 logger = logging.getLogger(__name__)
16 logger.addHandler(logging.FileHandler('cipher.log'))
17 logger.setLevel(logging.WARNING)
18 #logger.setLevel(logging.INFO)
19
20 english_counts = collections.defaultdict(int)
21 with open('count_1l.txt', 'r') as f:
22 for line in f:
23 (letter, count) = line.split("\t")
24 english_counts[letter] = int(count)
25 normalised_english_counts = norms.normalise(english_counts)
26
27 with open('words.txt', 'r') as f:
28 keywords = [line.rstrip() for line in f]
29
30 modular_division_table = [[0]*26 for x in range(26)]
31 for a in range(26):
32 for b in range(26):
33 c = (a * b) % 26
34 modular_division_table[b][c] = a
35
36
37 def sanitise(text):
38 """Remove all non-alphabetic characters and convert the text to lowercase
39
40 >>> sanitise('The Quick')
41 'thequick'
42 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
43 'thequickbrownfoxjumpedoverthelazydog'
44 """
45 sanitised = [c.lower() for c in text if c in string.ascii_letters]
46 return ''.join(sanitised)
47
48 def ngrams(text, n):
49 """Returns all n-grams of a text
50
51 >>> ngrams(sanitise('the quick brown fox'), 2)
52 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
53 >>> ngrams(sanitise('the quick brown fox'), 4)
54 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
55 """
56 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
57
58 def every_nth(text, n):
59 """Returns n strings, each of which consists of every nth character,
60 starting with the 0th, 1st, 2nd, ... (n-1)th character
61
62 >>> every_nth(string.ascii_lowercase, 5)
63 ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
64 >>> every_nth(string.ascii_lowercase, 1)
65 ['abcdefghijklmnopqrstuvwxyz']
66 >>> every_nth(string.ascii_lowercase, 26)
67 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
68 """
69 split_text = [text[i:i+n] for i in range(0, len(text), n)]
70 return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
71
72 def combine_every_nth(split_text):
73 """Reforms a text split into every_nth strings
74
75 >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
76 'abcdefghijklmnopqrstuvwxyz'
77 >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
78 'abcdefghijklmnopqrstuvwxyz'
79 >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
80 'abcdefghijklmnopqrstuvwxyz'
81 """
82 return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
83
84
85 def letter_frequencies(text):
86 """Count the number of occurrences of each character in text
87
88 >>> sorted(letter_frequencies('abcdefabc').items())
89 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
90 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
91 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
92 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
93 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
94 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
95 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
96 """
97 counts = collections.defaultdict(int)
98 for c in text:
99 counts[c] += 1
100 return counts
101
102 def deduplicate(text):
103 return list(collections.OrderedDict.fromkeys(text))
104
105
106
107 def caesar_encipher_letter(letter, shift):
108 """Encipher a letter, given a shift amount
109
110 >>> caesar_encipher_letter('a', 1)
111 'b'
112 >>> caesar_encipher_letter('a', 2)
113 'c'
114 >>> caesar_encipher_letter('b', 2)
115 'd'
116 >>> caesar_encipher_letter('x', 2)
117 'z'
118 >>> caesar_encipher_letter('y', 2)
119 'a'
120 >>> caesar_encipher_letter('z', 2)
121 'b'
122 >>> caesar_encipher_letter('z', -1)
123 'y'
124 >>> caesar_encipher_letter('a', -1)
125 'z'
126 """
127 if letter in string.ascii_letters:
128 if letter in string.ascii_uppercase:
129 alphabet_start = ord('A')
130 else:
131 alphabet_start = ord('a')
132 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
133 else:
134 return letter
135
136 def caesar_decipher_letter(letter, shift):
137 """Decipher a letter, given a shift amount
138
139 >>> caesar_decipher_letter('b', 1)
140 'a'
141 >>> caesar_decipher_letter('b', 2)
142 'z'
143 """
144 return caesar_encipher_letter(letter, -shift)
145
146 def caesar_encipher(message, shift):
147 """Encipher a message with the Caesar cipher of given shift
148
149 >>> caesar_encipher('abc', 1)
150 'bcd'
151 >>> caesar_encipher('abc', 2)
152 'cde'
153 >>> caesar_encipher('abcxyz', 2)
154 'cdezab'
155 >>> caesar_encipher('ab cx yz', 2)
156 'cd ez ab'
157 """
158 enciphered = [caesar_encipher_letter(l, shift) for l in message]
159 return ''.join(enciphered)
160
161 def caesar_decipher(message, shift):
162 """Encipher a message with the Caesar cipher of given shift
163
164 >>> caesar_decipher('bcd', 1)
165 'abc'
166 >>> caesar_decipher('cde', 2)
167 'abc'
168 >>> caesar_decipher('cd ez ab', 2)
169 'ab cx yz'
170 """
171 return caesar_encipher(message, -shift)
172
173 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
174 """Encipher a letter, given a multiplier and adder
175
176 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
177 'HKNQTWZCFILORUXADGJMPSVYBE'
178 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
179 'FILORUXADGJMPSVYBEHKNQTWZC'
180 """
181 if letter in string.ascii_letters:
182 if letter in string.ascii_uppercase:
183 alphabet_start = ord('A')
184 else:
185 alphabet_start = ord('a')
186 letter_number = ord(letter) - alphabet_start
187 if one_based: letter_number += 1
188 cipher_number = (letter_number * multiplier + adder) % 26
189 if one_based: cipher_number -= 1
190 return chr(cipher_number % 26 + alphabet_start)
191 else:
192 return letter
193
194 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
195 """Encipher a letter, given a multiplier and adder
196
197 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
198 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
199 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
200 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
201 """
202 if letter in string.ascii_letters:
203 if letter in string.ascii_uppercase:
204 alphabet_start = ord('A')
205 else:
206 alphabet_start = ord('a')
207 cipher_number = ord(letter) - alphabet_start
208 if one_based: cipher_number += 1
209 plaintext_number = modular_division_table[multiplier][(cipher_number - adder) % 26]
210 if one_based: plaintext_number -= 1
211 return chr(plaintext_number % 26 + alphabet_start)
212 else:
213 return letter
214
215 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
216 """Encipher a message
217
218 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
219 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
220 """
221 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
222 return ''.join(enciphered)
223
224 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
225 """Decipher a message
226
227 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
228 'hours passed during which jerico tried every trick he could think of'
229 """
230 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
231 return ''.join(enciphered)
232
233
234 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
235 """Find the cipher alphabet given a keyword.
236 wrap_alphabet controls how the rest of the alphabet is added
237 after the keyword.
238 0 : from 'a'
239 1 : from the last letter in the sanitised keyword
240 2 : from the largest letter in the sanitised keyword
241
242 >>> keyword_cipher_alphabet_of('bayes')
243 'bayescdfghijklmnopqrtuvwxz'
244 >>> keyword_cipher_alphabet_of('bayes', 0)
245 'bayescdfghijklmnopqrtuvwxz'
246 >>> keyword_cipher_alphabet_of('bayes', 1)
247 'bayestuvwxzcdfghijklmnopqr'
248 >>> keyword_cipher_alphabet_of('bayes', 2)
249 'bayeszcdfghijklmnopqrtuvwx'
250 """
251 if wrap_alphabet == 0:
252 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
253 else:
254 if wrap_alphabet == 1:
255 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
256 else:
257 last_keyword_letter = sorted(sanitise(keyword))[-1]
258 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
259 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
260 string.ascii_lowercase[last_keyword_position:] +
261 string.ascii_lowercase))
262 return cipher_alphabet
263
264
265 def keyword_encipher(message, keyword, wrap_alphabet=0):
266 """Enciphers a message with a keyword substitution cipher.
267 wrap_alphabet controls how the rest of the alphabet is added
268 after the keyword.
269 0 : from 'a'
270 1 : from the last letter in the sanitised keyword
271 2 : from the largest letter in the sanitised keyword
272
273 >>> keyword_encipher('test message', 'bayes')
274 'rsqr ksqqbds'
275 >>> keyword_encipher('test message', 'bayes', 0)
276 'rsqr ksqqbds'
277 >>> keyword_encipher('test message', 'bayes', 1)
278 'lskl dskkbus'
279 >>> keyword_encipher('test message', 'bayes', 2)
280 'qspq jsppbcs'
281 """
282 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
283 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
284 return message.lower().translate(cipher_translation)
285
286 def keyword_decipher(message, keyword, wrap_alphabet=0):
287 """Deciphers a message with a keyword substitution cipher.
288 wrap_alphabet controls how the rest of the alphabet is added
289 after the keyword.
290 0 : from 'a'
291 1 : from the last letter in the sanitised keyword
292 2 : from the largest letter in the sanitised keyword
293
294 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
295 'test message'
296 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
297 'test message'
298 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
299 'test message'
300 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
301 'test message'
302 """
303 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
304 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
305 return message.lower().translate(cipher_translation)
306
307
308 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
309 """Breaks a Caesar cipher using frequency analysis
310
311 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
312 (4, 0.31863952890183...)
313 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
314 (19, 0.42152901235832...)
315 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
316 (13, 0.316029208075451...)
317 """
318 sanitised_message = sanitise(message)
319 best_shift = 0
320 best_fit = float("inf")
321 for shift in range(26):
322 plaintext = caesar_decipher(sanitised_message, shift)
323 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
324 fit = metric(target_frequencies, frequencies)
325 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
326 if fit < best_fit:
327 best_fit = fit
328 best_shift = shift
329 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
330 return best_shift, best_fit
331
332 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
333 """Breaks an affine cipher using frequency analysis
334
335 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
336 ((15, 22, True), 0.23570361818655...)
337 """
338 sanitised_message = sanitise(message)
339 best_multiplier = 0
340 best_adder = 0
341 best_one_based = True
342 best_fit = float("inf")
343 for one_based in [True, False]:
344 for multiplier in range(1, 26, 2):
345 for adder in range(26):
346 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
347 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
348 fit = metric(target_frequencies, frequencies)
349 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
350 if fit < best_fit:
351 best_fit = fit
352 best_multiplier = multiplier
353 best_adder = adder
354 best_one_based = one_based
355 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
356 return (best_multiplier, best_adder, best_one_based), best_fit
357
358
359 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
360 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
361
362 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
363 (('elephant', 1), 0.41643991598441...)
364 """
365 best_keyword = ''
366 best_wrap_alphabet = True
367 best_fit = float("inf")
368 for wrap_alphabet in range(3):
369 for keyword in wordlist:
370 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
371 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
372 fit = metric(target_frequencies, frequencies)
373 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
374 if fit < best_fit:
375 best_fit = fit
376 best_keyword = keyword
377 best_wrap_alphabet = wrap_alphabet
378 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
379 return (best_keyword, best_wrap_alphabet), best_fit
380
381
382 if __name__ == "__main__":
383 import doctest
384 doctest.testmod()