Added notes on using timeit
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5 from segment import segment
6
7 # To time a run:
8 #
9 # import timeit
10 # c5a = open('2012/5a.ciphertext', 'r').read()
11 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
12
13
14 logger = logging.getLogger(__name__)
15 logger.addHandler(logging.FileHandler('cipher.log'))
16 logger.setLevel(logging.WARNING)
17 #logger.setLevel(logging.INFO)
18
19 english_counts = collections.defaultdict(int)
20 with open('count_1l.txt', 'r') as f:
21 for line in f:
22 (letter, count) = line.split("\t")
23 english_counts[letter] = int(count)
24 normalised_english_counts = norms.normalise(english_counts)
25
26 with open('words.txt', 'r') as f:
27 keywords = [line.rstrip() for line in f]
28
29 modular_division_table = [[0]*26 for x in range(26)]
30 for a in range(26):
31 for b in range(26):
32 c = (a * b) % 26
33 modular_division_table[b][c] = a
34
35 modular_division_table_one_based = [[0]*27 for x in range(27)]
36 for a in range(27):
37 for b in range(27):
38 c = ((a * b)-1) % 26 + 1
39 modular_division_table_one_based[b][c] = a
40
41
42
43 def sanitise(text):
44 """Remove all non-alphabetic characters and convert the text to lowercase
45
46 >>> sanitise('The Quick')
47 'thequick'
48 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
49 'thequickbrownfoxjumpedoverthelazydog'
50 """
51 sanitised = [c.lower() for c in text if c in string.ascii_letters]
52 return ''.join(sanitised)
53
54 def ngrams(text, n):
55 """Returns all n-grams of a text
56
57 >>> ngrams(sanitise('the quick brown fox'), 2)
58 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
59 >>> ngrams(sanitise('the quick brown fox'), 4)
60 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
61 """
62 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
63
64 def letter_frequencies(text):
65 """Count the number of occurrences of each character in text
66
67 >>> sorted(letter_frequencies('abcdefabc').items())
68 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
69 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
70 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
71 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
72 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
73 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
74 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
75 """
76 counts = collections.defaultdict(int)
77 for c in text:
78 counts[c] += 1
79 return counts
80
81 def deduplicate(text):
82 return list(collections.OrderedDict.fromkeys(text))
83
84
85
86 def caesar_encipher_letter(letter, shift):
87 """Encipher a letter, given a shift amount
88
89 >>> caesar_encipher_letter('a', 1)
90 'b'
91 >>> caesar_encipher_letter('a', 2)
92 'c'
93 >>> caesar_encipher_letter('b', 2)
94 'd'
95 >>> caesar_encipher_letter('x', 2)
96 'z'
97 >>> caesar_encipher_letter('y', 2)
98 'a'
99 >>> caesar_encipher_letter('z', 2)
100 'b'
101 >>> caesar_encipher_letter('z', -1)
102 'y'
103 >>> caesar_encipher_letter('a', -1)
104 'z'
105 """
106 if letter in string.ascii_letters:
107 if letter in string.ascii_uppercase:
108 alphabet_start = ord('A')
109 else:
110 alphabet_start = ord('a')
111 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
112 else:
113 return letter
114
115 def caesar_decipher_letter(letter, shift):
116 """Decipher a letter, given a shift amount
117
118 >>> caesar_decipher_letter('b', 1)
119 'a'
120 >>> caesar_decipher_letter('b', 2)
121 'z'
122 """
123 return caesar_encipher_letter(letter, -shift)
124
125 def caesar_encipher(message, shift):
126 """Encipher a message with the Caesar cipher of given shift
127
128 >>> caesar_encipher('abc', 1)
129 'bcd'
130 >>> caesar_encipher('abc', 2)
131 'cde'
132 >>> caesar_encipher('abcxyz', 2)
133 'cdezab'
134 >>> caesar_encipher('ab cx yz', 2)
135 'cd ez ab'
136 """
137 enciphered = [caesar_encipher_letter(l, shift) for l in message]
138 return ''.join(enciphered)
139
140 def caesar_decipher(message, shift):
141 """Encipher a message with the Caesar cipher of given shift
142
143 >>> caesar_decipher('bcd', 1)
144 'abc'
145 >>> caesar_decipher('cde', 2)
146 'abc'
147 >>> caesar_decipher('cd ez ab', 2)
148 'ab cx yz'
149 """
150 return caesar_encipher(message, -shift)
151
152 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
153 """Encipher a letter, given a multiplier and adder
154
155 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
156 'HKNQTWZCFILORUXADGJMPSVYBE'
157 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
158 'FILORUXADGJMPSVYBEHKNQTWZC'
159 """
160 if letter in string.ascii_letters:
161 if letter in string.ascii_uppercase:
162 alphabet_start = ord('A')
163 else:
164 alphabet_start = ord('a')
165 letter_number = ord(letter) - alphabet_start
166 if one_based:
167 letter_number += 1
168 raw_cipher_number = (letter_number * multiplier + adder)
169 cipher_number = 0
170 if one_based:
171 cipher_number = (raw_cipher_number - 1) % 26
172 else:
173 cipher_number = raw_cipher_number % 26
174 return chr(cipher_number + alphabet_start)
175 else:
176 return letter
177
178 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
179 """Encipher a letter, given a multiplier and adder
180
181 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
182 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
183 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
184 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
185 """
186 if letter in string.ascii_letters:
187 if letter in string.ascii_uppercase:
188 alphabet_start = ord('A')
189 else:
190 alphabet_start = ord('a')
191 cipher_number = ord(letter) - alphabet_start
192 if one_based:
193 cipher_number += 1
194 plaintext_number = 0
195 if one_based:
196 plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
197 else:
198 #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
199 plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
200 return chr(plaintext_number + alphabet_start)
201 else:
202 return letter
203
204 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
205 """Encipher a message
206
207 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
208 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
209 """
210 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
211 return ''.join(enciphered)
212
213 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
214 """Decipher a message
215
216 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
217 'hours passed during which jerico tried every trick he could think of'
218 """
219 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
220 return ''.join(enciphered)
221
222
223 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
224 """Find the cipher alphabet given a keyword.
225 wrap_alphabet controls how the rest of the alphabet is added
226 after the keyword.
227 0 : from 'a'
228 1 : from the last letter in the sanitised keyword
229 2 : from the largest letter in the sanitised keyword
230
231 >>> keyword_cipher_alphabet_of('bayes')
232 'bayescdfghijklmnopqrtuvwxz'
233 >>> keyword_cipher_alphabet_of('bayes', 0)
234 'bayescdfghijklmnopqrtuvwxz'
235 >>> keyword_cipher_alphabet_of('bayes', 1)
236 'bayestuvwxzcdfghijklmnopqr'
237 >>> keyword_cipher_alphabet_of('bayes', 2)
238 'bayeszcdfghijklmnopqrtuvwx'
239 """
240 if wrap_alphabet == 0:
241 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
242 else:
243 if wrap_alphabet == 1:
244 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
245 else:
246 last_keyword_letter = sorted(sanitise(keyword))[-1]
247 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
248 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
249 string.ascii_lowercase[last_keyword_position:] +
250 string.ascii_lowercase))
251 return cipher_alphabet
252
253
254 def keyword_encipher(message, keyword, wrap_alphabet=0):
255 """Enciphers a message with a keyword substitution cipher.
256 wrap_alphabet controls how the rest of the alphabet is added
257 after the keyword.
258 0 : from 'a'
259 1 : from the last letter in the sanitised keyword
260 2 : from the largest letter in the sanitised keyword
261
262 >>> keyword_encipher('test message', 'bayes')
263 'rsqr ksqqbds'
264 >>> keyword_encipher('test message', 'bayes', 0)
265 'rsqr ksqqbds'
266 >>> keyword_encipher('test message', 'bayes', 1)
267 'lskl dskkbus'
268 >>> keyword_encipher('test message', 'bayes', 2)
269 'qspq jsppbcs'
270 """
271 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
272 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
273 return message.lower().translate(cipher_translation)
274
275 def keyword_decipher(message, keyword, wrap_alphabet=0):
276 """Deciphers a message with a keyword substitution cipher.
277 wrap_alphabet controls how the rest of the alphabet is added
278 after the keyword.
279 0 : from 'a'
280 1 : from the last letter in the sanitised keyword
281 2 : from the largest letter in the sanitised keyword
282
283 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
284 'test message'
285 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
286 'test message'
287 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
288 'test message'
289 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
290 'test message'
291 """
292 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
293 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
294 return message.lower().translate(cipher_translation)
295
296
297 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
298 """Breaks a Caesar cipher using frequency analysis
299
300 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
301 (4, 0.31863952890183...)
302 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
303 (19, 0.42152901235832...)
304 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
305 (13, 0.316029208075451...)
306 """
307 sanitised_message = sanitise(message)
308 best_shift = 0
309 best_fit = float("inf")
310 for shift in range(26):
311 plaintext = caesar_decipher(sanitised_message, shift)
312 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
313 fit = metric(target_frequencies, frequencies)
314 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
315 if fit < best_fit:
316 best_fit = fit
317 best_shift = shift
318 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
319 return best_shift, best_fit
320
321 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
322 """Breaks an affine cipher using frequency analysis
323
324 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
325 ((15, 22, True), 0.23570361818655...)
326 """
327 sanitised_message = sanitise(message)
328 best_multiplier = 0
329 best_adder = 0
330 best_one_based = True
331 best_fit = float("inf")
332 for one_based in [True, False]:
333 for multiplier in range(1, 26, 2):
334 for adder in range(26):
335 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
336 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
337 fit = metric(target_frequencies, frequencies)
338 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
339 if fit < best_fit:
340 best_fit = fit
341 best_multiplier = multiplier
342 best_adder = adder
343 best_one_based = one_based
344 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
345 return (best_multiplier, best_adder, best_one_based), best_fit
346
347
348 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
349 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
350
351 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
352 (('elephant', 1), 0.41643991598441...)
353 """
354 best_keyword = ''
355 best_wrap_alphabet = True
356 best_fit = float("inf")
357 for wrap_alphabet in range(3):
358 for keyword in wordlist:
359 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
360 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
361 fit = metric(target_frequencies, frequencies)
362 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
363 if fit < best_fit:
364 best_fit = fit
365 best_keyword = keyword
366 best_wrap_alphabet = wrap_alphabet
367 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
368 return (best_keyword, best_wrap_alphabet), best_fit
369
370
371 if __name__ == "__main__":
372 import doctest
373 doctest.testmod()