import string
import collections


def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    """
    sanitised = [c.lower() for c in text if c in string.ascii_letters]
    return ''.join(sanitised)


def letter_frequencies(text):
    """Count the number of occurrences of each character in text.

    >>> sorted(letter_frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    """
    counts = collections.defaultdict(int)
    for c in text:
        counts[c] += 1
    return counts
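
# Implementation note: collections.Counter performs the same character count
# in a single call; a minimal equivalent sketch, should you prefer it:
#
#     def letter_frequencies(text):
#         return collections.Counter(text)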


def normalise_frequencies(frequencies):
    """Scale a set of letter frequencies so they add to 1.

    >>> sorted(normalise_frequencies(letter_frequencies('abcdefabc')).items())
    [('a', 0.2222222222222222), ('b', 0.2222222222222222), ('c', 0.2222222222222222), ('d', 0.1111111111111111), ('e', 0.1111111111111111), ('f', 0.1111111111111111)]
    >>> sorted(normalise_frequencies(letter_frequencies('the quick brown fox jumped over the lazy dog')).items())
    [(' ', 0.18181818181818182), ('a', 0.022727272727272728), ('b', 0.022727272727272728), ('c', 0.022727272727272728), ('d', 0.045454545454545456), ('e', 0.09090909090909091), ('f', 0.022727272727272728), ('g', 0.022727272727272728), ('h', 0.045454545454545456), ('i', 0.022727272727272728), ('j', 0.022727272727272728), ('k', 0.022727272727272728), ('l', 0.022727272727272728), ('m', 0.022727272727272728), ('n', 0.022727272727272728), ('o', 0.09090909090909091), ('p', 0.022727272727272728), ('q', 0.022727272727272728), ('r', 0.045454545454545456), ('t', 0.045454545454545456), ('u', 0.045454545454545456), ('v', 0.022727272727272728), ('w', 0.022727272727272728), ('x', 0.022727272727272728), ('y', 0.022727272727272728), ('z', 0.022727272727272728)]
    >>> sorted(normalise_frequencies(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
    [(' ', 0.1568627450980392), ('!', 0.0196078431372549), ('(', 0.0196078431372549), (')', 0.0196078431372549), ('.', 0.058823529411764705), ('9', 0.0196078431372549), ('B', 0.0196078431372549), ('D', 0.0196078431372549), ('G', 0.0196078431372549), ('N', 0.0196078431372549), ('O', 0.0392156862745098), ('Q', 0.0196078431372549), ('R', 0.0196078431372549), ('T', 0.0196078431372549), ('W', 0.0196078431372549), ('a', 0.0196078431372549), ('c', 0.0196078431372549), ('d', 0.0196078431372549), ('e', 0.0784313725490196), ('f', 0.0196078431372549), ('h', 0.0392156862745098), ('i', 0.0196078431372549), ('j', 0.0196078431372549), ('k', 0.0196078431372549), ('l', 0.0196078431372549), ('m', 0.0196078431372549), ('o', 0.0392156862745098), ('p', 0.0196078431372549), ('r', 0.0196078431372549), ('t', 0.0196078431372549), ('u', 0.0392156862745098), ('v', 0.0196078431372549), ('x', 0.0196078431372549), ('y', 0.0196078431372549), ('z', 0.0196078431372549)]
    >>> sorted(normalise_frequencies(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG'))).items())
    [('a', 0.027777777777777776), ('b', 0.027777777777777776), ('c', 0.027777777777777776), ('d', 0.05555555555555555), ('e', 0.1111111111111111), ('f', 0.027777777777777776), ('g', 0.027777777777777776), ('h', 0.05555555555555555), ('i', 0.027777777777777776), ('j', 0.027777777777777776), ('k', 0.027777777777777776), ('l', 0.027777777777777776), ('m', 0.027777777777777776), ('n', 0.027777777777777776), ('o', 0.1111111111111111), ('p', 0.027777777777777776), ('q', 0.027777777777777776), ('r', 0.05555555555555555), ('t', 0.05555555555555555), ('u', 0.05555555555555555), ('v', 0.027777777777777776), ('w', 0.027777777777777776), ('x', 0.027777777777777776), ('y', 0.027777777777777776), ('z', 0.027777777777777776)]
    """
    total = sum(frequencies.values())
    return {k: v / total for k, v in frequencies.items()}


def l2_norm(frequencies1, frequencies2):
    """Find the distance between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l2_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    0.816496580927726
    >>> l2_norm({'a':0, 'b':1}, {'a':1, 'b':1})
    0.7071067811865476
    """
    f1n = normalise_frequencies(frequencies1)
    f2n = normalise_frequencies(frequencies2)
    total = 0
    for k in f1n.keys():
        total += (f1n[k] - f2n[k]) ** 2
    return total ** 0.5

euclidean_distance = l2_norm


def l1_norm(frequencies1, frequencies2):
    """Find the distance between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l1_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l1_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l1_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    1.3333333333333333
    >>> l1_norm({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    f1n = normalise_frequencies(frequencies1)
    f2n = normalise_frequencies(frequencies2)
    total = 0
    for k in f1n.keys():
        total += abs(f1n[k] - f2n[k])
    return total


def l3_norm(frequencies1, frequencies2):
    """Find the distance between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l3_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    0.7181448966772946
    >>> l3_norm({'a':0, 'b':1}, {'a':1, 'b':1})
    0.6299605249474366
    """
    f1n = normalise_frequencies(frequencies1)
    f2n = normalise_frequencies(frequencies2)
    total = 0
    for k in f1n.keys():
        total += abs(f1n[k] - f2n[k]) ** 3
    return total ** (1/3)
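

# The three norms above follow one pattern; a possible generalisation (an
# addition for illustration, not part of the original module) is the
# Minkowski, or l-p, distance:

def lp_norm(frequencies1, frequencies2, p=2):
    """Find the l-p distance between two frequency profiles.
    Assumes every key in frequencies1 is also in frequencies2.
    With p=1, p=2 and p=3 this reproduces l1_norm, l2_norm and l3_norm.
    """
    f1n = normalise_frequencies(frequencies1)
    f2n = normalise_frequencies(frequencies2)
    total = sum(abs(f1n[k] - f2n[k]) ** p for k in f1n)
    return total ** (1 / p)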


def cosine_distance(frequencies1, frequencies2):
    """Find the distance between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    0.42264973081037416
    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
    0.29289321881345254
    """
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1.keys():
        numerator += frequencies1[k] * frequencies2[k]
        length1 += frequencies1[k] ** 2
    for k in frequencies2.keys():
        length2 += frequencies2[k] ** 2
    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
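
# Note: cosine distance is unchanged by scaling either profile (the doctests
# for the identical and doubled profiles above give the same value), so it
# can be applied to raw counts without calling normalise_frequencies first.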


def caesar_encipher_letter(letter, shift):
    """Encipher a letter, given a shift amount.

    >>> caesar_encipher_letter('a', 1)
    'b'
    >>> caesar_encipher_letter('a', 2)
    'c'
    >>> caesar_encipher_letter('b', 2)
    'd'
    >>> caesar_encipher_letter('x', 2)
    'z'
    >>> caesar_encipher_letter('y', 2)
    'a'
    >>> caesar_encipher_letter('z', 2)
    'b'
    >>> caesar_encipher_letter('z', -1)
    'y'
    >>> caesar_encipher_letter('a', -1)
    'z'
    """
    if letter in string.ascii_letters:
        if letter in string.ascii_uppercase:
            alphabet_start = ord('A')
        else:
            alphabet_start = ord('a')
        return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
    else:
        return letter
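
# Worked example of the arithmetic above, for caesar_encipher_letter('y', 2):
# ord('y') - ord('a') = 24; (24 + 2) % 26 = 0; chr(0 + ord('a')) = 'a'.
# Python's % always returns a non-negative result, so negative shifts wrap
# correctly too: for ('a', -1), (0 - 1) % 26 = 25, giving 'z'.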


def caesar_decipher_letter(letter, shift):
    """Decipher a letter, given a shift amount.

    >>> caesar_decipher_letter('b', 1)
    'a'
    >>> caesar_decipher_letter('b', 2)
    'z'
    """
    return caesar_encipher_letter(letter, -shift)


def caesar_encipher(message, shift):
    """Encipher a message with the Caesar cipher of given shift.

    >>> caesar_encipher('abc', 1)
    'bcd'
    >>> caesar_encipher('abc', 2)
    'cde'
    >>> caesar_encipher('abcxyz', 2)
    'cdezab'
    >>> caesar_encipher('ab cx yz', 2)
    'cd ez ab'
    """
    enciphered = [caesar_encipher_letter(l, shift) for l in message]
    return ''.join(enciphered)


def caesar_decipher(message, shift):
    """Decipher a message enciphered with the Caesar cipher of given shift.

    >>> caesar_decipher('bcd', 1)
    'abc'
    >>> caesar_decipher('cde', 2)
    'abc'
    >>> caesar_decipher('cd ez ab', 2)
    'ab cx yz'
    """
    return caesar_encipher(message, -shift)
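
# Enciphering then deciphering with the same shift is the identity:
# caesar_decipher(caesar_encipher(message, shift), shift) == message,
# as the doctests above illustrate.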


def caesar_break(message, metric=euclidean_distance):
    """Break a Caesar cipher by trying every shift and returning the one
    whose letter frequencies best match English, together with its fit
    under the given metric (smaller is better).
    """
    sanitised_message = sanitise(message)
    best_shift = 0
    best_fit = float("inf")
    for shift in range(26):
        plaintext = caesar_decipher(sanitised_message, shift)
        frequencies = letter_frequencies(plaintext)
        # Pass the candidate's profile first: the metrics assume every key
        # of their first argument appears in the second, which holds here
        # because english_counts covers the whole alphabet.
        fit = metric(frequencies, english_counts)
        if fit < best_fit:
            best_fit = fit
            best_shift = shift
    return best_shift, best_fit
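
# Example use (shown as a comment rather than a doctest because the fit value
# depends on the counts loaded from count_1l.txt):
#
#     shift, fit = caesar_break(caesar_encipher('the quick brown fox', 7))
#     # shift should come back as 7; pass metric=l1_norm, l3_norm or
#     # cosine_distance to score candidates with a different distance.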


# Load English letter counts: count_1l.txt is expected to hold one letter
# and its count per line, separated by a tab.
english_counts = collections.defaultdict(int)
with open('count_1l.txt', 'r') as f:
    for line in f:
        (letter, count) = line.split("\t")
        english_counts[letter] = int(count)


if __name__ == "__main__":
    import doctest
    doctest.testmod()