5 from itertools
import zip_longest
, cycle
6 from segment
import segment
7 from multiprocessing
import Pool
14 # c5a = open('2012/5a.ciphertext', 'r').read()
15 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
16 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
# Load English letter, bigram and trigram frequency tables from tab-separated
# count files.  These normalised tables are the fitness targets for the
# frequency-analysis cipher breakers defined below.
# NOTE(review): loop headers were missing from the damaged source; the
# `for line in f:` loops are reconstructed from the visible body statements.
english_counts = collections.defaultdict(int)
with open('count_1l.txt', 'r') as f:
    for line in f:
        (letter, count) = line.split("\t")
        english_counts[letter] = int(count)
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.defaultdict(int)
with open('count_2l.txt', 'r') as f:
    for line in f:
        (bigram, count) = line.split("\t")
        english_bigram_counts[bigram] = int(count)
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.defaultdict(int)
with open('count_3l.txt', 'r') as f:
    for line in f:
        (trigram, count) = line.split("\t")
        english_trigram_counts[trigram] = int(count)
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# Dictionary of candidate keywords, and the same words grouped by their
# column-transposition pattern (several words can share one transposition,
# so each value is a list of words).
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]

transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
def frequencies(text):
    """Count the number of occurrences of each character in text

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
            'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
            '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
            'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter tallies the characters directly; the generator expression
    # (and the earlier hand-rolled defaultdict) was redundant.  Missing
    # keys read as 0, matching the last doctest.
    return collections.Counter(text)

letter_frequencies = frequencies
def caesar_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks a Caesar cipher using frequency analysis

    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
        'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
    (4, 0.31863952890183...)
    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
        'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
    (19, 0.42152901235832...)
    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
        'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
    (13, 0.316029208075451...)
    """
    sanitised_message = sanitise(message)
    best_shift = 0
    best_fit = float("inf")
    # Try all 26 shifts and keep the one whose letter-frequency profile is
    # closest (under `metric`) to the target English counts.
    for shift in range(26):
        plaintext = caesar_decipher(sanitised_message, shift)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
                     'and decrypt starting: {2}'.format(shift, fit,
                                                        plaintext[:50]))
        # Restored from damaged source: track the best-scoring shift,
        # otherwise best_shift/best_fit are never updated.
        if fit < best_fit:
            best_fit = fit
            best_shift = shift
    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_shift, best_fit,
                    caesar_decipher(sanitised_message, best_shift)[:50]))
    return best_shift, best_fit
def affine_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks an affine cipher using frequency analysis

    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
        'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
        'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
        'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
        'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
        'clm ckuxj.') # doctest: +ELLIPSIS
    ((15, 22, True), 0.23570361818655...)
    """
    sanitised_message = sanitise(message)
    best_multiplier = 0
    best_adder = 0
    best_one_based = True
    best_fit = float("inf")
    # Exhaustive search: both alphabet conventions, the 13 multipliers
    # coprime with 26 (odd, excluding 13 handled by the cipher itself),
    # and all 26 adders.
    for one_based in [True, False]:
        for multiplier in range(1, 26, 2):
            for adder in range(26):
                plaintext = affine_decipher(sanitised_message,
                                            multiplier, adder, one_based)
                counts = message_frequency_scaling(letter_frequencies(plaintext))
                fit = metric(target_counts, counts)
                logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
                             'gives fit of {3} and decrypt starting: {4}'.
                             format(multiplier, adder, one_based, fit,
                                    plaintext[:50]))
                # Restored from damaged source: keep the best key triple.
                if fit < best_fit:
                    best_fit = fit
                    best_multiplier = multiplier
                    best_adder = adder
                    best_one_based = one_based
    logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
                'and decrypt starting: {4}'.format(
                    best_multiplier, best_adder, best_one_based, best_fit,
                    affine_decipher(sanitised_message, best_multiplier,
                                    best_adder, best_one_based)[:50]))
    return (best_multiplier, best_adder, best_one_based), best_fit
def keyword_break(message,
                  wordlist=keywords,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis

    >>> keyword_break(keyword_encipher('this is a test message for the ' \
          'keyword decipherment', 'elephant', 1), \
          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.41643991598441...)
    """
    best_keyword = ''
    best_wrap_alphabet = True
    best_fit = float("inf")
    # Try every dictionary word under each of the three wrap-alphabet
    # conventions accepted by keyword_decipher.
    for wrap_alphabet in range(3):
        for keyword in wordlist:
            plaintext = keyword_decipher(message, keyword, wrap_alphabet)
            counts = message_frequency_scaling(letter_frequencies(plaintext))
            fit = metric(target_counts, counts)
            logger.debug('Keyword break attempt using key {0} (wrap={1}) '
                         'gives fit of {2} and decrypt starting: {3}'.format(
                             keyword, wrap_alphabet, fit,
                             sanitise(plaintext)[:50]))
            # Restored from damaged source: keep the best keyword/wrap pair.
            if fit < best_fit:
                best_fit = fit
                best_keyword = keyword
                best_wrap_alphabet = wrap_alphabet
    logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
                '{2} and decrypt starting: {3}'.format(best_keyword,
                    best_wrap_alphabet, best_fit, sanitise(
                        keyword_decipher(message, best_keyword,
                                         best_wrap_alphabet))[:50]))
    return (best_keyword, best_wrap_alphabet), best_fit
def keyword_break_mp(message,
                     wordlist=keywords,
                     metric=norms.euclidean_distance,
                     target_counts=normalised_english_counts,
                     message_frequency_scaling=norms.normalise,
                     chunksize=500):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis, spreading the work across a process pool.

    >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
          'keyword decipherment', 'elephant', 1), \
          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.41643991598441...)
    """
    # `with` ensures the pool's worker processes are torn down even if a
    # worker raises.
    with Pool() as pool:
        helper_args = [(message, word, wrap, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist for wrap in range(3)]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
        # Each break is ((keyword, wrap), fit); the smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts,
                         message_frequency_scaling):
    """Score a single keyword/wrap-alphabet candidate for keyword_break_mp.

    Deciphers `message` with the candidate key, scales its letter
    frequencies, and returns ((keyword, wrap_alphabet), fit) where a lower
    fit means a closer match to `target_counts` under `metric`.
    """
    deciphered = keyword_decipher(message, keyword, wrap_alphabet)
    scaled = message_frequency_scaling(letter_frequencies(deciphered))
    fitness = metric(target_counts, scaled)
    logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
                 '{2} and decrypt starting: {3}'.format(keyword,
                     wrap_alphabet, fitness, sanitise(deciphered)[:50]))
    return (keyword, wrap_alphabet), fitness
def scytale_break(message,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_bigram_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a Scytale cipher

    >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
        'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
        'aek') # doctest: +ELLIPSIS
    (6, 0.83453041115025...)
    """
    best_key = 0
    best_fit = float("inf")
    # The n-gram size to score with is implied by the keys of the target
    # table (2 for bigrams, 3 for trigrams, ...).
    ngram_length = len(next(iter(target_counts.keys())))
    for key in range(1, 20):
        # Scytale only decrypts cleanly when the key divides the length.
        if len(message) % key == 0:
            plaintext = scytale_decipher(message, key)
            counts = message_frequency_scaling(frequencies(
                ngrams(sanitise(plaintext), ngram_length)))
            fit = metric(target_counts, counts)
            logger.debug('Scytale break attempt using key {0} gives fit of '
                         '{1} and decrypt starting: {2}'.format(key,
                             fit, sanitise(plaintext)[:50]))
            # Restored from damaged source: keep the best-scoring key.
            if fit < best_fit:
                best_fit = fit
                best_key = key
    logger.info('Scytale break best fit with key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_key, best_fit,
                    sanitise(scytale_decipher(message, best_key))[:50]))
    return best_key, best_fit
def column_transposition_break(message,
                               translist=transpositions,
                               metric=norms.euclidean_distance,
                               target_counts=normalised_english_bigram_counts,
                               message_frequency_scaling=norms.normalise):
    """Breaks a column transposition cipher using a dictionary and
    n-gram frequency analysis

    >>> column_transposition_break(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, \
            when homosexual acts were still illegal in the United Kingdom. "), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 0.898128626285...)
    >>> column_transposition_break(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
            "when homosexual acts were still illegal in the United Kingdom."), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 1.1958792913127...)
    """
    best_transposition = ''
    best_fit = float("inf")
    # n-gram size implied by the keys of the target table.
    ngram_length = len(next(iter(target_counts.keys())))
    for transposition in translist.keys():
        # Only transpositions whose width divides the message length
        # decrypt cleanly.
        if len(message) % len(transposition) == 0:
            plaintext = column_transposition_decipher(message, transposition)
            counts = message_frequency_scaling(frequencies(
                ngrams(sanitise(plaintext), ngram_length)))
            fit = metric(target_counts, counts)
            logger.debug('Column transposition break attempt using key {0} '
                         'gives fit of {1} and decrypt starting: {2}'.format(
                             translist[transposition][0], fit,
                             sanitise(plaintext)[:50]))
            # Restored from damaged source: keep the best transposition.
            if fit < best_fit:
                best_fit = fit
                best_transposition = transposition
    logger.info('Column transposition break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(
                    translist[best_transposition][0], best_fit,
                    sanitise(column_transposition_decipher(message,
                        best_transposition))[:50]))
    return best_transposition, best_fit
def column_transposition_break_mp(message,
                                  translist=transpositions,
                                  metric=norms.euclidean_distance,
                                  target_counts=normalised_english_bigram_counts,
                                  message_frequency_scaling=norms.normalise,
                                  chunksize=500):
    """Breaks a column transposition cipher using a dictionary and
    n-gram frequency analysis, spreading the work across a process pool.

    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, \
            when homosexual acts were still illegal in the United Kingdom. "), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 0.898128626285...)
    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
            "when homosexual acts were still illegal in the United Kingdom."), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 1.1958792913127...)
    """
    # n-gram size implied by the keys of the target table; computed once
    # here so each worker doesn't have to redo it.
    ngram_length = len(next(iter(target_counts.keys())))
    with Pool() as pool:
        helper_args = [(message, trans, metric, target_counts, ngram_length,
                        message_frequency_scaling)
                       for trans in translist.keys()]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(column_transposition_break_worker,
                              helper_args, chunksize)
        # Each break is (transposition, fit); the smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def column_transposition_break_worker(message, transposition, metric,
                                      target_counts, ngram_length,
                                      message_frequency_scaling):
    """Score a single transposition candidate for
    column_transposition_break_mp, returning (transposition, fit).
    Lower fit means the deciphered text's n-gram frequencies are closer
    to target_counts under metric.
    """
    plaintext = column_transposition_decipher(message, transposition)
    counts = message_frequency_scaling(frequencies(
        ngrams(sanitise(plaintext), ngram_length)))
    fit = metric(target_counts, counts)
    # Restored from damaged source: the format call was missing its
    # `transposition, fit` arguments for placeholders {0} and {1}.
    logger.debug('Column transposition break attempt using key {0} '
                 'gives fit of {1} and decrypt starting: {2}'.format(
                     transposition, fit,
                     sanitise(plaintext)[:50]))
    return transposition, fit
def vigenere_keyword_break(message,
                           wordlist=keywords,
                           metric=norms.euclidean_distance,
                           target_counts=normalised_english_counts,
                           message_frequency_scaling=norms.normalise):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis

    >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
             'message for the vigenere decipherment'), 'cat'), \
             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.4950195952826...)
    """
    best_keyword = ''
    best_fit = float("inf")
    for keyword in wordlist:
        plaintext = vigenere_decipher(message, keyword)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Vigenere break attempt using key {0} '
                     'gives fit of {1} and decrypt starting: {2}'.format(
                         keyword, fit,
                         sanitise(plaintext)[:50]))
        # Restored from damaged source: keep the best-scoring keyword.
        if fit < best_fit:
            best_fit = fit
            best_keyword = keyword
    logger.info('Vigenere break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_keyword,
                    best_fit, sanitise(
                        vigenere_decipher(message, best_keyword))[:50]))
    return best_keyword, best_fit
def vigenere_keyword_break_mp(message,
                              wordlist=keywords,
                              metric=norms.euclidean_distance,
                              target_counts=normalised_english_counts,
                              message_frequency_scaling=norms.normalise,
                              chunksize=500):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis, spreading the work across a process pool.

    >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
             'message for the vigenere decipherment'), 'cat'), \
             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.4950195952826...)
    """
    with Pool() as pool:
        helper_args = [(message, word, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(vigenere_keyword_break_worker,
                              helper_args, chunksize)
        # Each break is (keyword, fit); the smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def vigenere_keyword_break_worker(message, keyword, metric, target_counts,
                                  message_frequency_scaling):
    """Score a single keyword candidate for vigenere_keyword_break_mp,
    returning (keyword, fit).  Lower fit means the deciphered text's
    letter frequencies are closer to target_counts under metric.
    """
    plaintext = vigenere_decipher(message, keyword)
    counts = message_frequency_scaling(letter_frequencies(plaintext))
    fit = metric(target_counts, counts)
    logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
                 '{1} and decrypt starting: {2}'.format(keyword,
                     fit, sanitise(plaintext)[:50]))
    # Restored from damaged source: without this return the worker yields
    # None and the caller's min() over (keyword, fit) pairs fails.
    return keyword, fit
425 if __name__
== "__main__":