Fixed bugs in geometric and harmonic means, added some tests.
[cipher-tools.git] / cipher.py
1 import string
2 import collections
3 import norms
4
5 english_counts = collections.defaultdict(int)
6 with open('count_1l.txt', 'r') as f:
7 for line in f:
8 (letter, count) = line.split("\t")
9 english_counts[letter] = int(count)
10 normalised_english_counts = norms.normalise(english_counts)
11
12
13 def sanitise(text):
14 """Remove all non-alphabetic characters and convert the text to lowercase
15
16 >>> sanitise('The Quick')
17 'thequick'
18 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
19 'thequickbrownfoxjumpedoverthelazydog'
20 """
21 sanitised = [c.lower() for c in text if c in string.ascii_letters]
22 return ''.join(sanitised)
23
24 def ngrams(text, n):
25 return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
26
27 def letter_frequencies(text):
28 """Count the number of occurrences of each character in text
29
30 >>> sorted(letter_frequencies('abcdefabc').items())
31 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
32 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
33 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
34 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
35 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
36 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
37 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
38 """
39 counts = collections.defaultdict(int)
40 for c in text:
41 counts[c] += 1
42 return counts
43
44 def caesar_encipher_letter(letter, shift):
45 """Encipher a letter, given a shift amount
46
47 >>> caesar_encipher_letter('a', 1)
48 'b'
49 >>> caesar_encipher_letter('a', 2)
50 'c'
51 >>> caesar_encipher_letter('b', 2)
52 'd'
53 >>> caesar_encipher_letter('x', 2)
54 'z'
55 >>> caesar_encipher_letter('y', 2)
56 'a'
57 >>> caesar_encipher_letter('z', 2)
58 'b'
59 >>> caesar_encipher_letter('z', -1)
60 'y'
61 >>> caesar_encipher_letter('a', -1)
62 'z'
63 """
64 if letter in string.ascii_letters:
65 if letter in string.ascii_uppercase:
66 alphabet_start = ord('A')
67 else:
68 alphabet_start = ord('a')
69 return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
70 else:
71 return letter
72
73 def caesar_decipher_letter(letter, shift):
74 """Decipher a letter, given a shift amount
75
76 >>> caesar_decipher_letter('b', 1)
77 'a'
78 >>> caesar_decipher_letter('b', 2)
79 'z'
80 """
81 return caesar_encipher_letter(letter, -shift)
82
83 def caesar_encipher(message, shift):
84 """Encipher a message with the Caesar cipher of given shift
85
86 >>> caesar_encipher('abc', 1)
87 'bcd'
88 >>> caesar_encipher('abc', 2)
89 'cde'
90 >>> caesar_encipher('abcxyz', 2)
91 'cdezab'
92 >>> caesar_encipher('ab cx yz', 2)
93 'cd ez ab'
94 """
95 enciphered = [caesar_encipher_letter(l, shift) for l in message]
96 return ''.join(enciphered)
97
98 def caesar_decipher(message, shift):
99 """Encipher a message with the Caesar cipher of given shift
100
101 >>> caesar_decipher('bcd', 1)
102 'abc'
103 >>> caesar_decipher('cde', 2)
104 'abc'
105 >>> caesar_decipher('cd ez ab', 2)
106 'ab cx yz'
107 """
108 return caesar_encipher(message, -shift)
109
110 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
111 """Breaks a Caesar cipher using frequency analysis
112
113
114 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm')
115 (4, 0.3186395289018361)
116 >>> caesar_break('jhzhuhfrqilqhgwrdevwudfwuhdvrqlqjwkhqkdylqjvxemhfwhgwrfulwlflvpwkhhasodqdwlrqrisrzhuwkdwmxulglfdovfl')
117 (3, 0.32902042861730835)
118 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert')
119 (19, 0.4215290123583277)
120 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur')
121 (13, 0.31602920807545154)
122 """
123 sanitised_message = sanitise(message)
124 best_shift = 0
125 best_fit = float("inf")
126 for shift in range(26):
127 plaintext = caesar_decipher(sanitised_message, shift)
128 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
129 fit = metric(target_frequencies, frequencies)
130 if fit < best_fit:
131 best_fit = fit
132 best_shift = shift
133 return best_shift, best_fit
134
135
136 if __name__ == "__main__":
137 import doctest
138 doctest.testmod()