5 from segment
import segment
10 # c5a = open('2012/5a.ciphertext', 'r').read()
11 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
14 logger
= logging
.getLogger(__name__
)
15 logger
.addHandler(logging
.FileHandler('cipher.log'))
16 logger
.setLevel(logging
.WARNING
)
17 #logger.setLevel(logging.INFO)
19 english_counts
= collections
.defaultdict(int)
20 with
open('count_1l.txt', 'r') as f
:
22 (letter
, count
) = line
.split("\t")
23 english_counts
[letter
] = int(count
)
24 normalised_english_counts
= norms
.normalise(english_counts
)
26 with
open('words.txt', 'r') as f
:
27 keywords
= [line
.rstrip() for line
in f
]
29 modular_division_table
= [[0]*26 for x
in range(26)]
33 modular_division_table
[b
][c
] = a
35 modular_division_table_one_based
= [[0]*27 for x
in range(27)]
38 c
= ((a
* b
)-1) % 26 + 1
39 modular_division_table_one_based
[b
][c
] = a
44 """Remove all non-alphabetic characters and convert the text to lowercase
46 >>> sanitise('The Quick')
48 >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
49 'thequickbrownfoxjumpedoverthelazydog'
51 sanitised
= [c
.lower() for c
in text
if c
in string
.ascii_letters
]
52 return ''.join(sanitised
)
55 """Returns all n-grams of a text
57 >>> ngrams(sanitise('the quick brown fox'), 2)
58 [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
59 >>> ngrams(sanitise('the quick brown fox'), 4)
60 [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
62 return [tuple(text
[i
:i
+n
]) for i
in range(len(text
)-n
+1)]
64 def letter_frequencies(text
):
65 """Count the number of occurrences of each character in text
67 >>> sorted(letter_frequencies('abcdefabc').items())
68 [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
69 >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
70 [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
71 >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
72 [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
73 >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
74 [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
76 counts
= collections
.defaultdict(int)
81 def deduplicate(text
):
82 return list(collections
.OrderedDict
.fromkeys(text
))
86 def caesar_encipher_letter(letter
, shift
):
87 """Encipher a letter, given a shift amount
89 >>> caesar_encipher_letter('a', 1)
91 >>> caesar_encipher_letter('a', 2)
93 >>> caesar_encipher_letter('b', 2)
95 >>> caesar_encipher_letter('x', 2)
97 >>> caesar_encipher_letter('y', 2)
99 >>> caesar_encipher_letter('z', 2)
101 >>> caesar_encipher_letter('z', -1)
103 >>> caesar_encipher_letter('a', -1)
106 if letter
in string
.ascii_letters
:
107 if letter
in string
.ascii_uppercase
:
108 alphabet_start
= ord('A')
110 alphabet_start
= ord('a')
111 return chr(((ord(letter
) - alphabet_start
+ shift
) % 26) + alphabet_start
)
115 def caesar_decipher_letter(letter
, shift
):
116 """Decipher a letter, given a shift amount
118 >>> caesar_decipher_letter('b', 1)
120 >>> caesar_decipher_letter('b', 2)
123 return caesar_encipher_letter(letter
, -shift
)
125 def caesar_encipher(message
, shift
):
126 """Encipher a message with the Caesar cipher of given shift
128 >>> caesar_encipher('abc', 1)
130 >>> caesar_encipher('abc', 2)
132 >>> caesar_encipher('abcxyz', 2)
134 >>> caesar_encipher('ab cx yz', 2)
137 enciphered
= [caesar_encipher_letter(l
, shift
) for l
in message
]
138 return ''.join(enciphered
)
140 def caesar_decipher(message
, shift
):
141 """Encipher a message with the Caesar cipher of given shift
143 >>> caesar_decipher('bcd', 1)
145 >>> caesar_decipher('cde', 2)
147 >>> caesar_decipher('cd ez ab', 2)
150 return caesar_encipher(message
, -shift
)
152 def affine_encipher_letter(letter
, multiplier
=1, adder
=0, one_based
=True):
153 """Encipher a letter, given a multiplier and adder
155 >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
156 'HKNQTWZCFILORUXADGJMPSVYBE'
157 >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
158 'FILORUXADGJMPSVYBEHKNQTWZC'
160 if letter
in string
.ascii_letters
:
161 if letter
in string
.ascii_uppercase
:
162 alphabet_start
= ord('A')
164 alphabet_start
= ord('a')
165 letter_number
= ord(letter
) - alphabet_start
168 raw_cipher_number
= (letter_number
* multiplier
+ adder
)
171 cipher_number
= (raw_cipher_number
- 1) % 26
173 cipher_number
= raw_cipher_number
% 26
174 return chr(cipher_number
+ alphabet_start
)
178 def affine_decipher_letter(letter
, multiplier
=1, adder
=0, one_based
=True):
179 """Encipher a letter, given a multiplier and adder
181 >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
182 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
183 >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
184 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
186 if letter
in string
.ascii_letters
:
187 if letter
in string
.ascii_uppercase
:
188 alphabet_start
= ord('A')
190 alphabet_start
= ord('a')
191 cipher_number
= ord(letter
) - alphabet_start
196 plaintext_number
= (modular_division_table_one_based
[multiplier
][(cipher_number
- adder
+ 26) % 26] - 1) % 26
198 #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
199 plaintext_number
= modular_division_table
[multiplier
][(cipher_number
- adder
+ 26) % 26]
200 return chr(plaintext_number
+ alphabet_start
)
204 def affine_encipher(message
, multiplier
=1, adder
=0, one_based
=True):
205 """Encipher a message
207 >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
208 'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
210 enciphered
= [affine_encipher_letter(l
, multiplier
, adder
, one_based
) for l
in message
]
211 return ''.join(enciphered
)
213 def affine_decipher(message
, multiplier
=1, adder
=0, one_based
=True):
214 """Decipher a message
216 >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
217 'hours passed during which jerico tried every trick he could think of'
219 enciphered
= [affine_decipher_letter(l
, multiplier
, adder
, one_based
) for l
in message
]
220 return ''.join(enciphered
)
223 def keyword_cipher_alphabet_of(keyword
, wrap_alphabet
=0):
224 """Find the cipher alphabet given a keyword.
225 wrap_alphabet controls how the rest of the alphabet is added
228 1 : from the last letter in the sanitised keyword
229 2 : from the largest letter in the sanitised keyword
231 >>> keyword_cipher_alphabet_of('bayes')
232 'bayescdfghijklmnopqrtuvwxz'
233 >>> keyword_cipher_alphabet_of('bayes', 0)
234 'bayescdfghijklmnopqrtuvwxz'
235 >>> keyword_cipher_alphabet_of('bayes', 1)
236 'bayestuvwxzcdfghijklmnopqr'
237 >>> keyword_cipher_alphabet_of('bayes', 2)
238 'bayeszcdfghijklmnopqrtuvwx'
240 if wrap_alphabet
== 0:
241 cipher_alphabet
= ''.join(deduplicate(sanitise(keyword
) + string
.ascii_lowercase
))
243 if wrap_alphabet
== 1:
244 last_keyword_letter
= deduplicate(sanitise(keyword
))[-1]
246 last_keyword_letter
= sorted(sanitise(keyword
))[-1]
247 last_keyword_position
= string
.ascii_lowercase
.find(last_keyword_letter
) + 1
248 cipher_alphabet
= ''.join(deduplicate(sanitise(keyword
) +
249 string
.ascii_lowercase
[last_keyword_position
:] +
250 string
.ascii_lowercase
))
251 return cipher_alphabet
254 def keyword_encipher(message
, keyword
, wrap_alphabet
=0):
255 """Enciphers a message with a keyword substitution cipher.
256 wrap_alphabet controls how the rest of the alphabet is added
259 1 : from the last letter in the sanitised keyword
260 2 : from the largest letter in the sanitised keyword
262 >>> keyword_encipher('test message', 'bayes')
264 >>> keyword_encipher('test message', 'bayes', 0)
266 >>> keyword_encipher('test message', 'bayes', 1)
268 >>> keyword_encipher('test message', 'bayes', 2)
271 cipher_alphabet
= keyword_cipher_alphabet_of(keyword
, wrap_alphabet
)
272 cipher_translation
= ''.maketrans(string
.ascii_lowercase
, cipher_alphabet
)
273 return message
.lower().translate(cipher_translation
)
275 def keyword_decipher(message
, keyword
, wrap_alphabet
=0):
276 """Deciphers a message with a keyword substitution cipher.
277 wrap_alphabet controls how the rest of the alphabet is added
280 1 : from the last letter in the sanitised keyword
281 2 : from the largest letter in the sanitised keyword
283 >>> keyword_decipher('rsqr ksqqbds', 'bayes')
285 >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
287 >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
289 >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
292 cipher_alphabet
= keyword_cipher_alphabet_of(keyword
, wrap_alphabet
)
293 cipher_translation
= ''.maketrans(cipher_alphabet
, string
.ascii_lowercase
)
294 return message
.lower().translate(cipher_translation
)
297 def caesar_break(message
, metric
=norms
.euclidean_distance
, target_frequencies
=normalised_english_counts
, message_frequency_scaling
=norms
.normalise
):
298 """Breaks a Caesar cipher using frequency analysis
300 >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
301 (4, 0.31863952890183...)
302 >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
303 (19, 0.42152901235832...)
304 >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
305 (13, 0.316029208075451...)
307 sanitised_message
= sanitise(message
)
309 best_fit
= float("inf")
310 for shift
in range(26):
311 plaintext
= caesar_decipher(sanitised_message
, shift
)
312 frequencies
= message_frequency_scaling(letter_frequencies(plaintext
))
313 fit
= metric(target_frequencies
, frequencies
)
314 logger
.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift
, fit
, plaintext
[:50]))
318 logger
.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift
, best_fit
, caesar_decipher(sanitised_message
, best_shift
)[:50]))
319 return best_shift
, best_fit
321 def affine_break(message
, metric
=norms
.euclidean_distance
, target_frequencies
=normalised_english_counts
, message_frequency_scaling
=norms
.normalise
):
322 """Breaks an affine cipher using frequency analysis
324 >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
325 ((15, 22, True), 0.23570361818655...)
327 sanitised_message
= sanitise(message
)
330 best_one_based
= True
331 best_fit
= float("inf")
332 for one_based
in [True, False]:
333 for multiplier
in range(1, 26, 2):
334 for adder
in range(26):
335 plaintext
= affine_decipher(sanitised_message
, multiplier
, adder
, one_based
)
336 frequencies
= message_frequency_scaling(letter_frequencies(plaintext
))
337 fit
= metric(target_frequencies
, frequencies
)
338 logger
.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier
, adder
, one_based
, fit
, plaintext
[:50]))
341 best_multiplier
= multiplier
343 best_one_based
= one_based
344 logger
.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier
, best_adder
, best_one_based
, best_fit
, affine_decipher(sanitised_message
, best_multiplier
, best_adder
, best_one_based
)[:50]))
345 return (best_multiplier
, best_adder
, best_one_based
), best_fit
348 def keyword_break(message
, wordlist
=keywords
, metric
=norms
.euclidean_distance
, target_frequencies
=normalised_english_counts
, message_frequency_scaling
=norms
.normalise
):
349 """Breaks a keyword substitution cipher using a dictionary and frequency analysis
351 >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
352 (('elephant', 1), 0.41643991598441...)
355 best_wrap_alphabet
= True
356 best_fit
= float("inf")
357 for wrap_alphabet
in range(3):
358 for keyword
in wordlist
:
359 plaintext
= keyword_decipher(message
, keyword
, wrap_alphabet
)
360 frequencies
= message_frequency_scaling(letter_frequencies(plaintext
))
361 fit
= metric(target_frequencies
, frequencies
)
362 logger
.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword
, wrap_alphabet
, fit
, sanitise(plaintext
)[:50]))
365 best_keyword
= keyword
366 best_wrap_alphabet
= wrap_alphabet
367 logger
.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword
, best_wrap_alphabet
, best_fit
, sanitise(keyword_decipher(message
, best_keyword
))[:50]))
368 return (best_keyword
, best_wrap_alphabet
), best_fit
371 if __name__
== "__main__":