Word segmentation not working, but it's now late...
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5
6 logger = logging.getLogger(__name__)
7 logger.addHandler(logging.FileHandler('cipher.log'))
8 logger.setLevel(logging.WARNING)
9 #logger.setLevel(logging.INFO)
10
11 english_counts = collections.defaultdict(int)
12 with open('count_1l.txt', 'r') as f:
13 for line in f:
14 (letter, count) = line.split("\t")
15 english_counts[letter] = int(count)
16 normalised_english_counts = norms.normalise(english_counts)
17
18 keywords = []
19 with open('words.txt', 'r') as f:
20 keywords = [line.rstrip() for line in f]
21
22
23 modular_division_table = [[0]*26 for x in range(26)]
24 for a in range(26):
25 for b in range(26):
26 c = (a * b) % 26
27 modular_division_table[b][c] = a
28
29 modular_division_table_one_based = [[0]*27 for x in range(27)]
30 for a in range(27):
31 for b in range(27):
32 c = ((a * b)-1) % 26 + 1
33 modular_division_table_one_based[b][c] = a
34
35
36
37 def sanitise(text):
38 """Remove all non-alphabetic characters and convert the text to lowercase
39
40 >>> sanitise('The Quick')
41 'thequick'
42 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
43 'thequickbrownfoxjumpedoverthelazydog'
44 """
45 sanitised = [c.lower() for c in text if c in string.ascii_letters]
46 return ''.join(sanitised)
47
48 def ngrams(text, n):
49 """Returns all n-grams of a text
50
51 >>> ngrams(sanitise('the quick brown fox'), 2)
52 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
53 >>> ngrams(sanitise('the quick brown fox'), 4)
54 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
55 """
56 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
57
58 def letter_frequencies(text):
59 """Count the number of occurrences of each character in text
60
61 >>> sorted(letter_frequencies('abcdefabc').items())
62 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
63 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
64 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
65 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
66 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
67 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
68 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
69 """
70 counts = collections.defaultdict(int)
71 for c in text:
72 counts[c] += 1
73 return counts
74
75 def deduplicate(text):
76 return list(collections.OrderedDict.fromkeys(text))
77
78
79
80 def caesar_encipher_letter(letter, shift):
81 """Encipher a letter, given a shift amount
82
83 >>> caesar_encipher_letter('a', 1)
84 'b'
85 >>> caesar_encipher_letter('a', 2)
86 'c'
87 >>> caesar_encipher_letter('b', 2)
88 'd'
89 >>> caesar_encipher_letter('x', 2)
90 'z'
91 >>> caesar_encipher_letter('y', 2)
92 'a'
93 >>> caesar_encipher_letter('z', 2)
94 'b'
95 >>> caesar_encipher_letter('z', -1)
96 'y'
97 >>> caesar_encipher_letter('a', -1)
98 'z'
99 """
100 if letter in string.ascii_letters:
101 if letter in string.ascii_uppercase:
102 alphabet_start = ord('A')
103 else:
104 alphabet_start = ord('a')
105 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
106 else:
107 return letter
108
109 def caesar_decipher_letter(letter, shift):
110 """Decipher a letter, given a shift amount
111
112 >>> caesar_decipher_letter('b', 1)
113 'a'
114 >>> caesar_decipher_letter('b', 2)
115 'z'
116 """
117 return caesar_encipher_letter(letter, -shift)
118
119 def caesar_encipher(message, shift):
120 """Encipher a message with the Caesar cipher of given shift
121
122 >>> caesar_encipher('abc', 1)
123 'bcd'
124 >>> caesar_encipher('abc', 2)
125 'cde'
126 >>> caesar_encipher('abcxyz', 2)
127 'cdezab'
128 >>> caesar_encipher('ab cx yz', 2)
129 'cd ez ab'
130 """
131 enciphered = [caesar_encipher_letter(l, shift) for l in message]
132 return ''.join(enciphered)
133
134 def caesar_decipher(message, shift):
135 """Encipher a message with the Caesar cipher of given shift
136
137 >>> caesar_decipher('bcd', 1)
138 'abc'
139 >>> caesar_decipher('cde', 2)
140 'abc'
141 >>> caesar_decipher('cd ez ab', 2)
142 'ab cx yz'
143 """
144 return caesar_encipher(message, -shift)
145
146 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
147 """Encipher a letter, given a multiplier and adder
148
149 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
150 'HKNQTWZCFILORUXADGJMPSVYBE'
151 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
152 'FILORUXADGJMPSVYBEHKNQTWZC'
153 """
154 if letter in string.ascii_letters:
155 if letter in string.ascii_uppercase:
156 alphabet_start = ord('A')
157 else:
158 alphabet_start = ord('a')
159 letter_number = ord(letter) - alphabet_start
160 if one_based: letter_number += 1
161 raw_cipher_number = (letter_number * multiplier + adder)
162 cipher_number = 0
163 if one_based:
164 cipher_number = (raw_cipher_number - 1) % 26
165 else:
166 cipher_number = raw_cipher_number % 26
167 return chr(cipher_number + alphabet_start)
168 else:
169 return letter
170
171 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
172 """Encipher a letter, given a multiplier and adder
173
174 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
175 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
176 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
177 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
178 """
179 if letter in string.ascii_letters:
180 if letter in string.ascii_uppercase:
181 alphabet_start = ord('A')
182 else:
183 alphabet_start = ord('a')
184 cipher_number = ord(letter) - alphabet_start
185 if one_based: cipher_number += 1
186 plaintext_number = 0
187 if one_based:
188 plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
189 else:
190 #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
191 plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
192 return chr(plaintext_number + alphabet_start)
193 else:
194 return letter
195
196 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
197 """Encipher a message
198
199 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
200 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
201 """
202
203 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
204 return ''.join(enciphered)
205
206 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
207 """Decipher a message
208
209 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
210 'hours passed during which jerico tried every trick he could think of'
211 """
212 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
213 return ''.join(enciphered)
214
215
216 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=False):
217 """Find the cipher alphabet given a keyword
218
219 >>> keyword_cipher_alphabet_of('harry')
220 'harybcdefgijklmnopqstuvwxz'
221 >>> keyword_cipher_alphabet_of('harry', True)
222 'haryzbcdefgijklmnopqstuvwx'
223 >>> keyword_cipher_alphabet_of('harry', False)
224 'harybcdefgijklmnopqstuvwxz'
225 """
226 cipher_alphabet = ''
227 if wrap_alphabet:
228 last_keyword_letter = deduplicate(sanitise(keyword))[-1]
229 last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
230 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase[last_keyword_position:] + string.ascii_lowercase))
231 else:
232 cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
233 return cipher_alphabet
234
235
236 def keyword_encipher(message, keyword, wrap_alphabet=False):
237 """Enciphers a message with a keyword substitution cipher
238
239 >>> keyword_encipher('test message', 'harry')
240 'sbqs kbqqhdb'
241 >>> keyword_encipher('test message', 'harry', True)
242 'qzpq jzpphcz'
243 >>> keyword_encipher('test message', 'harry', False)
244 'sbqs kbqqhdb'
245 """
246 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
247 cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
248 return message.lower().translate(cipher_translation)
249
250 def keyword_decipher(message, keyword, wrap_alphabet=False):
251 """Deciphers a message with a keyword substitution cipher
252
253 >>> keyword_decipher('sbqs kbqqhdb', 'harry')
254 'test message'
255 >>> keyword_decipher('qzpq jzpphcz', 'harry', True)
256 'test message'
257 >>> keyword_decipher('sbqs kbqqhdb', 'harry', False)
258 'test message'
259 """
260 cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
261 cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
262 return message.lower().translate(cipher_translation)
263
264
265 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
266 """Breaks a Caesar cipher using frequency analysis
267
268 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
269 (4, 0.31863952890183...)
270 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
271 (19, 0.42152901235832...)
272 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
273 (13, 0.316029208075451...)
274 """
275 sanitised_message = sanitise(message)
276 best_shift = 0
277 best_fit = float("inf")
278 for shift in range(26):
279 plaintext = caesar_decipher(sanitised_message, shift)
280 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
281 fit = metric(target_frequencies, frequencies)
282 logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
283 if fit < best_fit:
284 best_fit = fit
285 best_shift = shift
286 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
287 return best_shift, best_fit
288
289 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
290 """Breaks an affine cipher using frequency analysis
291
292 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
293 ((15, 22, True), 0.23570361818655...)
294 """
295 sanitised_message = sanitise(message)
296 best_multiplier = 0
297 best_adder = 0
298 best_one_based = True
299 best_fit = float("inf")
300 for one_based in [True, False]:
301 for multiplier in range(1, 26, 2):
302 for adder in range(26):
303 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
304 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
305 fit = metric(target_frequencies, frequencies)
306 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
307 if fit < best_fit:
308 best_fit = fit
309 best_multiplier = multiplier
310 best_adder = adder
311 best_one_based = one_based
312 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
313 return (best_multiplier, best_adder, best_one_based), best_fit
314
315
316 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
317 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
318
319 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', True), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
320 (('elephant', True), 0.41643991598441...)
321 """
322 best_keyword = ''
323 best_wrap_alphabet = True
324 best_fit = float("inf")
325 for wrap_alphabet in [True, False]:
326 for keyword in wordlist:
327 plaintext = keyword_decipher(message, keyword, wrap_alphabet)
328 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
329 fit = metric(target_frequencies, frequencies)
330 logger.debug('Keyword break attempt using key {0} ({1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
331 if fit < best_fit:
332 best_fit = fit
333 best_keyword = keyword
334 best_wrap_alphabet = wrap_alphabet
335 logger.info('Keyword break best fit with key {0} ({1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
336 return (best_keyword, best_wrap_alphabet), best_fit
337
338
339 if __name__ == "__main__":
340 import doctest
341 doctest.testmod()