Added a test for ngrams
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4 import logging
5
6 logger = logging.getLogger(__name__)
7 logger.addHandler(logging.FileHandler('cipher.log'))
8 logger.setLevel(logging.WARNING)
9 #logger.setLevel(logging.INFO)
10
11 english_counts = collections.defaultdict(int)
12 with open('count_1l.txt', 'r') as f:
13 for line in f:
14 (letter, count) = line.split("\t")
15 english_counts[letter] = int(count)
16 normalised_english_counts = norms.normalise(english_counts)
17
18
19 modular_division_table = [[0]*26 for x in range(26)]
20 for a in range(26):
21 for b in range(26):
22 c = (a * b) % 26
23 modular_division_table[b][c] = a
24
25 modular_division_table_one_based = [[0]*27 for x in range(27)]
26 for a in range(27):
27 for b in range(27):
28 c = ((a * b)-1) % 26 + 1
29 modular_division_table_one_based[b][c] = a
30
31
32
33 def sanitise(text):
34 """Remove all non-alphabetic characters and convert the text to lowercase
35
36 >>> sanitise('The Quick')
37 'thequick'
38 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
39 'thequickbrownfoxjumpedoverthelazydog'
40 """
41 sanitised = [c.lower() for c in text if c in string.ascii_letters]
42 return ''.join(sanitised)
43
44 def ngrams(text, n):
45 """Returns all n-grams of a text
46
47 >>> ngrams(sanitise('the quick brown fox'), 2)
48 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
49 >>> ngrams(sanitise('the quick brown fox'), 4)
50 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
51 """
52 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
53
54 def letter_frequencies(text):
55 """Count the number of occurrences of each character in text
56
57 >>> sorted(letter_frequencies('abcdefabc').items())
58 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
59 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
60 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
61 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
62 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
63 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
64 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
65 """
66 counts = collections.defaultdict(int)
67 for c in text:
68 counts[c] += 1
69 return counts
70
71 def caesar_encipher_letter(letter, shift):
72 """Encipher a letter, given a shift amount
73
74 >>> caesar_encipher_letter('a', 1)
75 'b'
76 >>> caesar_encipher_letter('a', 2)
77 'c'
78 >>> caesar_encipher_letter('b', 2)
79 'd'
80 >>> caesar_encipher_letter('x', 2)
81 'z'
82 >>> caesar_encipher_letter('y', 2)
83 'a'
84 >>> caesar_encipher_letter('z', 2)
85 'b'
86 >>> caesar_encipher_letter('z', -1)
87 'y'
88 >>> caesar_encipher_letter('a', -1)
89 'z'
90 """
91 if letter in string.ascii_letters:
92 if letter in string.ascii_uppercase:
93 alphabet_start = ord('A')
94 else:
95 alphabet_start = ord('a')
96 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
97 else:
98 return letter
99
100 def caesar_decipher_letter(letter, shift):
101 """Decipher a letter, given a shift amount
102
103 >>> caesar_decipher_letter('b', 1)
104 'a'
105 >>> caesar_decipher_letter('b', 2)
106 'z'
107 """
108 return caesar_encipher_letter(letter, -shift)
109
110 def caesar_encipher(message, shift):
111 """Encipher a message with the Caesar cipher of given shift
112
113 >>> caesar_encipher('abc', 1)
114 'bcd'
115 >>> caesar_encipher('abc', 2)
116 'cde'
117 >>> caesar_encipher('abcxyz', 2)
118 'cdezab'
119 >>> caesar_encipher('ab cx yz', 2)
120 'cd ez ab'
121 """
122 enciphered = [caesar_encipher_letter(l, shift) for l in message]
123 return ''.join(enciphered)
124
125 def caesar_decipher(message, shift):
126 """Encipher a message with the Caesar cipher of given shift
127
128 >>> caesar_decipher('bcd', 1)
129 'abc'
130 >>> caesar_decipher('cde', 2)
131 'abc'
132 >>> caesar_decipher('cd ez ab', 2)
133 'ab cx yz'
134 """
135 return caesar_encipher(message, -shift)
136
137 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
138 """Encipher a letter, given a multiplier and adder
139
140 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
141 'HKNQTWZCFILORUXADGJMPSVYBE'
142 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
143 'FILORUXADGJMPSVYBEHKNQTWZC'
144 """
145 if letter in string.ascii_letters:
146 if letter in string.ascii_uppercase:
147 alphabet_start = ord('A')
148 else:
149 alphabet_start = ord('a')
150 letter_number = ord(letter) - alphabet_start
151 if one_based: letter_number += 1
152 raw_cipher_number = (letter_number * multiplier + adder)
153 cipher_number = 0
154 if one_based:
155 cipher_number = (raw_cipher_number - 1) % 26
156 else:
157 cipher_number = raw_cipher_number % 26
158 return chr(cipher_number + alphabet_start)
159 else:
160 return letter
161
162 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
163 """Encipher a letter, given a multiplier and adder
164
165 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
166 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
167 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
168 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
169 """
170 if letter in string.ascii_letters:
171 if letter in string.ascii_uppercase:
172 alphabet_start = ord('A')
173 else:
174 alphabet_start = ord('a')
175 cipher_number = ord(letter) - alphabet_start
176 if one_based: cipher_number += 1
177 plaintext_number = 0
178 if one_based:
179 plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
180 else:
181 #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
182 plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
183 return chr(plaintext_number + alphabet_start)
184 else:
185 return letter
186
187 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
188 """Encipher a message
189
190 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
191 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
192 """
193
194 enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
195 return ''.join(enciphered)
196
197 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
198 """Decipher a message
199
200 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
201 'hours passed during which jerico tried every trick he could think of'
202 """
203 enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
204 return ''.join(enciphered)
205
206
207 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
208 """Breaks a Caesar cipher using frequency analysis
209
210 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm')
211 (4, 0.3186395289018361)
212 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert')
213 (19, 0.4215290123583277)
214 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur')
215 (13, 0.31602920807545154)
216 """
217 sanitised_message = sanitise(message)
218 best_shift = 0
219 best_fit = float("inf")
220 for shift in range(26):
221 plaintext = caesar_decipher(sanitised_message, shift)
222 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
223 fit = metric(target_frequencies, frequencies)
224 logger.info('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
225 if fit < best_fit:
226 best_fit = fit
227 best_shift = shift
228 logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
229 return best_shift, best_fit
230
231 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
232 """Breaks an affine cipher using frequency analysis
233
234 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.')
235 ((15, 22, True), 0.2357036181865554)
236 """
237 sanitised_message = sanitise(message)
238 best_multiplier = 0
239 best_adder = 0
240 best_one_based = True
241 best_fit = float("inf")
242 for one_based in [True, False]:
243 for multiplier in range(1, 26, 2):
244 for adder in range(26):
245 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
246 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
247 fit = metric(target_frequencies, frequencies)
248 logger.info('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
249 if fit < best_fit:
250 best_fit = fit
251 best_multiplier = multiplier
252 best_adder = adder
253 best_one_based = one_based
254 logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
255 return (best_multiplier, best_adder, best_one_based), best_fit
256
257
258 if __name__ == "__main__":
259 import doctest
260 doctest.testmod()