Added another mode to keyword cipher.
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5
6 logger = logging.getLogger(__name__)
7 logger.addHandler(logging.FileHandler('cipher.log'))
8 logger.setLevel(logging.WARNING)
9 #logger.setLevel(logging.INFO)
10
11 english_counts = collections.defaultdict(int)
12 with open('count_1l.txt', 'r') as f:
13 for line in f:
14 (letter, count) = line.split("\t")
15 english_counts[letter] = int(count)
16 normalised_english_counts = norms.normalise(english_counts)
17
18 with open('words.txt', 'r') as f:
19 keywords = [line.rstrip() for line in f]
20
21 modular_division_table = [[0]*26 for x in range(26)]
22 for a in range(26):
23 for b in range(26):
24 c = (a * b) % 26
25 modular_division_table[b][c] = a
26
27 modular_division_table_one_based = [[0]*27 for x in range(27)]
28 for a in range(27):
29 for b in range(27):
30 c = ((a * b)-1) % 26 + 1
31 modular_division_table_one_based[b][c] = a
32
33
34
35 def sanitise(text):
36 """Remove all non-alphabetic characters and convert the text to lowercase
37
38 >>> sanitise('The Quick')
39 'thequick'
40 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
41 'thequickbrownfoxjumpedoverthelazydog'
42 """
43 sanitised = [c.lower() for c in text if c in string.ascii_letters]
44 return ''.join(sanitised)
45
46 def ngrams(text, n):
47 """Returns all n-grams of a text
48
49 >>> ngrams(sanitise('the quick brown fox'), 2)
50 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
51 >>> ngrams(sanitise('the quick brown fox'), 4)
52 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
53 """
54 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
55
56 def letter_frequencies(text):
57 """Count the number of occurrences of each character in text
58
59 >>> sorted(letter_frequencies('abcdefabc').items())
60 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
61 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
62 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
63 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
64 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
65 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
66 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
67 """
68 counts = collections.defaultdict(int)
69 for c in text:
70 counts[c] += 1
71 return counts
72
73 def deduplicate(text):
74 return list(collections.OrderedDict.fromkeys(text))
75
76
77
78 def caesar_encipher_letter(letter, shift):
79 """Encipher a letter, given a shift amount
80
81 >>> caesar_encipher_letter('a', 1)
82 'b'
83 >>> caesar_encipher_letter('a', 2)
84 'c'
85 >>> caesar_encipher_letter('b', 2)
86 'd'
87 >>> caesar_encipher_letter('x', 2)
88 'z'
89 >>> caesar_encipher_letter('y', 2)
90 'a'
91 >>> caesar_encipher_letter('z', 2)
92 'b'
93 >>> caesar_encipher_letter('z', -1)
94 'y'
95 >>> caesar_encipher_letter('a', -1)
96 'z'
97 """
98 if letter in string.ascii_letters:
99 if letter in string.ascii_uppercase:
100 alphabet_start = ord('A')
101 else:
102 alphabet_start = ord('a')
103 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
104 else:
105 return letter
106
107 def caesar_decipher_letter(letter, shift):
108 """Decipher a letter, given a shift amount
109
110 >>> caesar_decipher_letter('b', 1)
111 'a'
112 >>> caesar_decipher_letter('b', 2)
113 'z'
114 """
115 return caesar_encipher_letter(letter, -shift)
116
117 def caesar_encipher(message, shift):
118 """Encipher a message with the Caesar cipher of given shift
119
120 >>> caesar_encipher('abc', 1)
121 'bcd'
122 >>> caesar_encipher('abc', 2)
123 'cde'
124 >>> caesar_encipher('abcxyz', 2)
125 'cdezab'
126 >>> caesar_encipher('ab cx yz', 2)
127 'cd ez ab'
128 """
129 enciphered = [caesar_encipher_letter(l, shift) for l in message]
130 return ''.join(enciphered)
131
132 def caesar_decipher(message, shift):
133 """Encipher a message with the Caesar cipher of given shift
134
135 >>> caesar_decipher('bcd', 1)
136 'abc'
137 >>> caesar_decipher('cde', 2)
138 'abc'
139 >>> caesar_decipher('cd ez ab', 2)
140 'ab cx yz'
141 """
142 return caesar_encipher(message, -shift)
143
144 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
145 """Encipher a letter, given a multiplier and adder
146
147 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
148 'HKNQTWZCFILORUXADGJMPSVYBE'
149 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
150 'FILORUXADGJMPSVYBEHKNQTWZC'
151 """
152 if letter in string.ascii_letters:
153 if letter in string.ascii_uppercase:
154 alphabet_start = ord('A')
155 else:
156 alphabet_start = ord('a')
157 letter_number = ord(letter) - alphabet_start
158 if one_based:
159 letter_number += 1
160 raw_cipher_number = (letter_number * multiplier + adder)
161 cipher_number = 0
162 if one_based:
163 cipher_number = (raw_cipher_number - 1) % 26
164 else:
165 cipher_number = raw_cipher_number % 26
166 return chr(cipher_number + alphabet_start)
167 else:
168 return letter
169
170 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
171 """Encipher a letter, given a multiplier and adder
172
173 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
174 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
175 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
176 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
177 """
178 if letter in string.ascii_letters:
179 if letter in string.ascii_uppercase:
180 alphabet_start = ord('A')
181 else:
182 alphabet_start = ord('a')
183 cipher_number = ord(letter) - alphabet_start
184 if one_based:
185 cipher_number += 1
186 plaintext_number = 0
187 if one_based:
188 plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
189 else:
190 #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
191 plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
192 return chr(plaintext_number + alphabet_start)
193 else:
194 return letter
195
196 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
197 """Encipher a message
198
199 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
200 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
201 """
202 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
203 return ''.join(enciphered)
204
205 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
206 """Decipher a message
207
208 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
209 'hours passed during which jerico tried every trick he could think of'
210 """
211 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
212 return ''.join(enciphered)
213
214
215 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
216 """Find the cipher alphabet given a keyword.
217 wrap_alphabet controls how the rest of the alphabet is added
218 after the keyword.
219 0 : from 'a'
220 1 : from the last letter in the sanitised keyword
221 2 : from the largest letter in the sanitised keyword
222
223 >>> keyword_cipher_alphabet_of('bayes')
224 'bayescdfghijklmnopqrtuvwxz'
225 >>> keyword_cipher_alphabet_of('bayes', 0)
226 'bayescdfghijklmnopqrtuvwxz'
227 >>> keyword_cipher_alphabet_of('bayes', 1)
228 'bayestuvwxzcdfghijklmnopqr'
229 >>> keyword_cipher_alphabet_of('bayes', 2)
230 'bayeszcdfghijklmnopqrtuvwx'
231 """
232 if wrap_alphabet == 0:
233 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
234 else:
235 if wrap_alphabet == 1:
236 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
237 else:
238 last_keyword_letter = sorted(sanitise(keyword))[-1]
239 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
240 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
241 string.ascii_lowercase[last_keyword_position:] +
242 string.ascii_lowercase))
243 return cipher_alphabet
244
245
246 def keyword_encipher(message, keyword, wrap_alphabet=0):
247 """Enciphers a message with a keyword substitution cipher.
248 wrap_alphabet controls how the rest of the alphabet is added
249 after the keyword.
250 0 : from 'a'
251 1 : from the last letter in the sanitised keyword
252 2 : from the largest letter in the sanitised keyword
253
254 >>> keyword_encipher('test message', 'bayes')
255 'rsqr ksqqbds'
256 >>> keyword_encipher('test message', 'bayes', 0)
257 'rsqr ksqqbds'
258 >>> keyword_encipher('test message', 'bayes', 1)
259 'lskl dskkbus'
260 >>> keyword_encipher('test message', 'bayes', 2)
261 'qspq jsppbcs'
262 """
263 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
264 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
265 return message.lower().translate(cipher_translation)
266
267 def keyword_decipher(message, keyword, wrap_alphabet=0):
268 """Deciphers a message with a keyword substitution cipher.
269 wrap_alphabet controls how the rest of the alphabet is added
270 after the keyword.
271 0 : from 'a'
272 1 : from the last letter in the sanitised keyword
273 2 : from the largest letter in the sanitised keyword
274
275 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
276 'test message'
277 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
278 'test message'
279 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
280 'test message'
281 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
282 'test message'
283 """
284 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
285 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
286 return message.lower().translate(cipher_translation)
287
288
289 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
290 """Breaks a Caesar cipher using frequency analysis
291
292 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
293 (4, 0.31863952890183...)
294 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
295 (19, 0.42152901235832...)
296 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
297 (13, 0.316029208075451...)
298 """
299 sanitised_message = sanitise(message)
300 best_shift = 0
301 best_fit = float("inf")
302 for shift in range(26):
303 plaintext = caesar_decipher(sanitised_message, shift)
304 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
305 fit = metric(target_frequencies, frequencies)
306 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
307 if fit < best_fit:
308 best_fit = fit
309 best_shift = shift
310 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
311 return best_shift, best_fit
312
313 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
314 """Breaks an affine cipher using frequency analysis
315
316 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
317 ((15, 22, True), 0.23570361818655...)
318 """
319 sanitised_message = sanitise(message)
320 best_multiplier = 0
321 best_adder = 0
322 best_one_based = True
323 best_fit = float("inf")
324 for one_based in [True, False]:
325 for multiplier in range(1, 26, 2):
326 for adder in range(26):
327 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
328 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
329 fit = metric(target_frequencies, frequencies)
330 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
331 if fit < best_fit:
332 best_fit = fit
333 best_multiplier = multiplier
334 best_adder = adder
335 best_one_based = one_based
336 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
337 return (best_multiplier, best_adder, best_one_based), best_fit
338
339
340 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
341 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
342
343 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
344 (('elephant', 1), 0.41643991598441...)
345 """
346 best_keyword = ''
347 best_wrap_alphabet = True
348 best_fit = float("inf")
349 for wrap_alphabet in range(3):
350 for keyword in wordlist:
351 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
352 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
353 fit = metric(target_frequencies, frequencies)
354 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
355 if fit < best_fit:
356 best_fit = fit
357 best_keyword = keyword
358 best_wrap_alphabet = wrap_alphabet
359 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
360 return (best_keyword, best_wrap_alphabet), best_fit
361
362
363 if __name__ == "__main__":
364 import doctest
365 doctest.testmod()