Segmentation working, though hits recursion limit for texts longer than 250 characters
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5 from segment import segment
6
7 logger = logging.getLogger(__name__)
8 logger.addHandler(logging.FileHandler('cipher.log'))
9 logger.setLevel(logging.WARNING)
10 #logger.setLevel(logging.INFO)
11
12 english_counts = collections.defaultdict(int)
13 with open('count_1l.txt', 'r') as f:
14 for line in f:
15 (letter, count) = line.split("\t")
16 english_counts[letter] = int(count)
17 normalised_english_counts = norms.normalise(english_counts)
18
19 with open('words.txt', 'r') as f:
20 keywords = [line.rstrip() for line in f]
21
22 modular_division_table = [[0]*26 for x in range(26)]
23 for a in range(26):
24 for b in range(26):
25 c = (a * b) % 26
26 modular_division_table[b][c] = a
27
28 modular_division_table_one_based = [[0]*27 for x in range(27)]
29 for a in range(27):
30 for b in range(27):
31 c = ((a * b)-1) % 26 + 1
32 modular_division_table_one_based[b][c] = a
33
34
35
36 def sanitise(text):
37 """Remove all non-alphabetic characters and convert the text to lowercase
38
39 >>> sanitise('The Quick')
40 'thequick'
41 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
42 'thequickbrownfoxjumpedoverthelazydog'
43 """
44 sanitised = [c.lower() for c in text if c in string.ascii_letters]
45 return ''.join(sanitised)
46
47 def ngrams(text, n):
48 """Returns all n-grams of a text
49
50 >>> ngrams(sanitise('the quick brown fox'), 2)
51 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
52 >>> ngrams(sanitise('the quick brown fox'), 4)
53 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
54 """
55 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
56
57 def letter_frequencies(text):
58 """Count the number of occurrences of each character in text
59
60 >>> sorted(letter_frequencies('abcdefabc').items())
61 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
62 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
63 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
64 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
65 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
66 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
67 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
68 """
69 counts = collections.defaultdict(int)
70 for c in text:
71 counts[c] += 1
72 return counts
73
74 def deduplicate(text):
75 return list(collections.OrderedDict.fromkeys(text))
76
77
78
79 def caesar_encipher_letter(letter, shift):
80 """Encipher a letter, given a shift amount
81
82 >>> caesar_encipher_letter('a', 1)
83 'b'
84 >>> caesar_encipher_letter('a', 2)
85 'c'
86 >>> caesar_encipher_letter('b', 2)
87 'd'
88 >>> caesar_encipher_letter('x', 2)
89 'z'
90 >>> caesar_encipher_letter('y', 2)
91 'a'
92 >>> caesar_encipher_letter('z', 2)
93 'b'
94 >>> caesar_encipher_letter('z', -1)
95 'y'
96 >>> caesar_encipher_letter('a', -1)
97 'z'
98 """
99 if letter in string.ascii_letters:
100 if letter in string.ascii_uppercase:
101 alphabet_start = ord('A')
102 else:
103 alphabet_start = ord('a')
104 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
105 else:
106 return letter
107
108 def caesar_decipher_letter(letter, shift):
109 """Decipher a letter, given a shift amount
110
111 >>> caesar_decipher_letter('b', 1)
112 'a'
113 >>> caesar_decipher_letter('b', 2)
114 'z'
115 """
116 return caesar_encipher_letter(letter, -shift)
117
118 def caesar_encipher(message, shift):
119 """Encipher a message with the Caesar cipher of given shift
120
121 >>> caesar_encipher('abc', 1)
122 'bcd'
123 >>> caesar_encipher('abc', 2)
124 'cde'
125 >>> caesar_encipher('abcxyz', 2)
126 'cdezab'
127 >>> caesar_encipher('ab cx yz', 2)
128 'cd ez ab'
129 """
130 enciphered = [caesar_encipher_letter(l, shift) for l in message]
131 return ''.join(enciphered)
132
133 def caesar_decipher(message, shift):
134 """Encipher a message with the Caesar cipher of given shift
135
136 >>> caesar_decipher('bcd', 1)
137 'abc'
138 >>> caesar_decipher('cde', 2)
139 'abc'
140 >>> caesar_decipher('cd ez ab', 2)
141 'ab cx yz'
142 """
143 return caesar_encipher(message, -shift)
144
145 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
146 """Encipher a letter, given a multiplier and adder
147
148 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
149 'HKNQTWZCFILORUXADGJMPSVYBE'
150 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
151 'FILORUXADGJMPSVYBEHKNQTWZC'
152 """
153 if letter in string.ascii_letters:
154 if letter in string.ascii_uppercase:
155 alphabet_start = ord('A')
156 else:
157 alphabet_start = ord('a')
158 letter_number = ord(letter) - alphabet_start
159 if one_based:
160 letter_number += 1
161 raw_cipher_number = (letter_number * multiplier + adder)
162 cipher_number = 0
163 if one_based:
164 cipher_number = (raw_cipher_number - 1) % 26
165 else:
166 cipher_number = raw_cipher_number % 26
167 return chr(cipher_number + alphabet_start)
168 else:
169 return letter
170
171 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
172 """Encipher a letter, given a multiplier and adder
173
174 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
175 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
176 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
177 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
178 """
179 if letter in string.ascii_letters:
180 if letter in string.ascii_uppercase:
181 alphabet_start = ord('A')
182 else:
183 alphabet_start = ord('a')
184 cipher_number = ord(letter) - alphabet_start
185 if one_based:
186 cipher_number += 1
187 plaintext_number = 0
188 if one_based:
189 plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
190 else:
191 #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
192 plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
193 return chr(plaintext_number + alphabet_start)
194 else:
195 return letter
196
197 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
198 """Encipher a message
199
200 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
201 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
202 """
203 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
204 return ''.join(enciphered)
205
206 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
207 """Decipher a message
208
209 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
210 'hours passed during which jerico tried every trick he could think of'
211 """
212 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
213 return ''.join(enciphered)
214
215
216 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
217 """Find the cipher alphabet given a keyword.
218 wrap_alphabet controls how the rest of the alphabet is added
219 after the keyword.
220 0 : from 'a'
221 1 : from the last letter in the sanitised keyword
222 2 : from the largest letter in the sanitised keyword
223
224 >>> keyword_cipher_alphabet_of('bayes')
225 'bayescdfghijklmnopqrtuvwxz'
226 >>> keyword_cipher_alphabet_of('bayes', 0)
227 'bayescdfghijklmnopqrtuvwxz'
228 >>> keyword_cipher_alphabet_of('bayes', 1)
229 'bayestuvwxzcdfghijklmnopqr'
230 >>> keyword_cipher_alphabet_of('bayes', 2)
231 'bayeszcdfghijklmnopqrtuvwx'
232 """
233 if wrap_alphabet == 0:
234 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
235 else:
236 if wrap_alphabet == 1:
237 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
238 else:
239 last_keyword_letter = sorted(sanitise(keyword))[-1]
240 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
241 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
242 string.ascii_lowercase[last_keyword_position:] +
243 string.ascii_lowercase))
244 return cipher_alphabet
245
246
247 def keyword_encipher(message, keyword, wrap_alphabet=0):
248 """Enciphers a message with a keyword substitution cipher.
249 wrap_alphabet controls how the rest of the alphabet is added
250 after the keyword.
251 0 : from 'a'
252 1 : from the last letter in the sanitised keyword
253 2 : from the largest letter in the sanitised keyword
254
255 >>> keyword_encipher('test message', 'bayes')
256 'rsqr ksqqbds'
257 >>> keyword_encipher('test message', 'bayes', 0)
258 'rsqr ksqqbds'
259 >>> keyword_encipher('test message', 'bayes', 1)
260 'lskl dskkbus'
261 >>> keyword_encipher('test message', 'bayes', 2)
262 'qspq jsppbcs'
263 """
264 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
265 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
266 return message.lower().translate(cipher_translation)
267
268 def keyword_decipher(message, keyword, wrap_alphabet=0):
269 """Deciphers a message with a keyword substitution cipher.
270 wrap_alphabet controls how the rest of the alphabet is added
271 after the keyword.
272 0 : from 'a'
273 1 : from the last letter in the sanitised keyword
274 2 : from the largest letter in the sanitised keyword
275
276 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
277 'test message'
278 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
279 'test message'
280 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
281 'test message'
282 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
283 'test message'
284 """
285 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
286 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
287 return message.lower().translate(cipher_translation)
288
289
290 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
291 """Breaks a Caesar cipher using frequency analysis
292
293 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
294 (4, 0.31863952890183...)
295 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
296 (19, 0.42152901235832...)
297 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
298 (13, 0.316029208075451...)
299 """
300 sanitised_message = sanitise(message)
301 best_shift = 0
302 best_fit = float("inf")
303 for shift in range(26):
304 plaintext = caesar_decipher(sanitised_message, shift)
305 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
306 fit = metric(target_frequencies, frequencies)
307 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
308 if fit < best_fit:
309 best_fit = fit
310 best_shift = shift
311 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
312 return best_shift, best_fit
313
314 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
315 """Breaks an affine cipher using frequency analysis
316
317 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
318 ((15, 22, True), 0.23570361818655...)
319 """
320 sanitised_message = sanitise(message)
321 best_multiplier = 0
322 best_adder = 0
323 best_one_based = True
324 best_fit = float("inf")
325 for one_based in [True, False]:
326 for multiplier in range(1, 26, 2):
327 for adder in range(26):
328 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
329 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
330 fit = metric(target_frequencies, frequencies)
331 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
332 if fit < best_fit:
333 best_fit = fit
334 best_multiplier = multiplier
335 best_adder = adder
336 best_one_based = one_based
337 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
338 return (best_multiplier, best_adder, best_one_based), best_fit
339
340
341 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
342 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
343
344 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
345 (('elephant', 1), 0.41643991598441...)
346 """
347 best_keyword = ''
348 best_wrap_alphabet = True
349 best_fit = float("inf")
350 for wrap_alphabet in range(3):
351 for keyword in wordlist:
352 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
353 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
354 fit = metric(target_frequencies, frequencies)
355 logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
356 if fit < best_fit:
357 best_fit = fit
358 best_keyword = keyword
359 best_wrap_alphabet = wrap_alphabet
360 logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
361 return (best_keyword, best_wrap_alphabet), best_fit
362
363
364 if __name__ == "__main__":
365 import doctest
366 doctest.testmod()