5 from itertools
import zip_longest
, cycle
6 from segment
import segment
7 from multiprocessing
import Pool
14 # c5a = open('2012/5a.ciphertext', 'r').read()
15 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
16 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
# Load English letter, bigram and trigram frequency tables from tab-separated
# count files.  These normalised tables are the fitness targets for the
# frequency-analysis cipher breakers defined below.
# NOTE(review): loop headers were missing from the damaged source; the
# `for line in f:` loops are reconstructed from the visible body statements.
english_counts = collections.defaultdict(int)
with open('count_1l.txt', 'r') as f:
    for line in f:
        (letter, count) = line.split("\t")
        english_counts[letter] = int(count)
normalised_english_counts = norms.normalise(english_counts)

english_bigram_counts = collections.defaultdict(int)
with open('count_2l.txt', 'r') as f:
    for line in f:
        (bigram, count) = line.split("\t")
        english_bigram_counts[bigram] = int(count)
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)

english_trigram_counts = collections.defaultdict(int)
with open('count_3l.txt', 'r') as f:
    for line in f:
        (trigram, count) = line.split("\t")
        english_trigram_counts[trigram] = int(count)
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)

# Dictionary of candidate keywords, and the same words grouped by their
# column-transposition pattern (several words can share one transposition,
# so each value is a list of words).
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]

transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
def frequencies(text):
    """Count the number of occurrences of each character in text

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
            'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
            '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
            'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter tallies the characters directly; the generator expression
    # (and the earlier hand-rolled defaultdict) was redundant.  Missing
    # keys read as 0, matching the last doctest.
    return collections.Counter(text)

letter_frequencies = frequencies
def caesar_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks a Caesar cipher using frequency analysis

    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
        'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
    (4, 0.31863952890183...)
    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
        'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
    (19, 0.42152901235832...)
    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
        'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
    (13, 0.316029208075451...)
    """
    sanitised_message = sanitise(message)
    best_shift = 0
    best_fit = float("inf")
    # Try all 26 shifts and keep the one whose letter-frequency profile is
    # closest (under `metric`) to the target English counts.
    for shift in range(26):
        plaintext = caesar_decipher(sanitised_message, shift)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
                     'and decrypt starting: {2}'.format(shift, fit,
                                                        plaintext[:50]))
        # Restored from damaged source: track the best-scoring shift,
        # otherwise best_shift/best_fit are never updated.
        if fit < best_fit:
            best_fit = fit
            best_shift = shift
    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_shift, best_fit,
                    caesar_decipher(sanitised_message, best_shift)[:50]))
    return best_shift, best_fit
def affine_break(message,
                 metric=norms.euclidean_distance,
                 target_counts=normalised_english_counts,
                 message_frequency_scaling=norms.normalise):
    """Breaks an affine cipher using frequency analysis

    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
        'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
        'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
        'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
        'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
        'clm ckuxj.') # doctest: +ELLIPSIS
    ((15, 22, True), 0.23570361818655...)
    """
    sanitised_message = sanitise(message)
    best_multiplier = 0
    best_adder = 0
    best_one_based = True
    best_fit = float("inf")
    # Exhaustive search: both alphabet conventions, the 13 multipliers
    # coprime with 26 (odd, excluding 13 handled by the cipher itself),
    # and all 26 adders.
    for one_based in [True, False]:
        for multiplier in range(1, 26, 2):
            for adder in range(26):
                plaintext = affine_decipher(sanitised_message,
                                            multiplier, adder, one_based)
                counts = message_frequency_scaling(letter_frequencies(plaintext))
                fit = metric(target_counts, counts)
                logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
                             'gives fit of {3} and decrypt starting: {4}'.
                             format(multiplier, adder, one_based, fit,
                                    plaintext[:50]))
                # Restored from damaged source: keep the best key triple.
                if fit < best_fit:
                    best_fit = fit
                    best_multiplier = multiplier
                    best_adder = adder
                    best_one_based = one_based
    logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
                'and decrypt starting: {4}'.format(
                    best_multiplier, best_adder, best_one_based, best_fit,
                    affine_decipher(sanitised_message, best_multiplier,
                                    best_adder, best_one_based)[:50]))
    return (best_multiplier, best_adder, best_one_based), best_fit
def keyword_break(message,
                  wordlist=keywords,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis

    >>> keyword_break(keyword_encipher('this is a test message for the ' \
          'keyword decipherment', 'elephant', 1), \
          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.41643991598441...)
    """
    best_keyword = ''
    best_wrap_alphabet = True
    best_fit = float("inf")
    # Try every dictionary word under each of the three wrap-alphabet
    # conventions accepted by keyword_decipher.
    for wrap_alphabet in range(3):
        for keyword in wordlist:
            plaintext = keyword_decipher(message, keyword, wrap_alphabet)
            counts = message_frequency_scaling(letter_frequencies(plaintext))
            fit = metric(target_counts, counts)
            logger.debug('Keyword break attempt using key {0} (wrap={1}) '
                         'gives fit of {2} and decrypt starting: {3}'.format(
                             keyword, wrap_alphabet, fit,
                             sanitise(plaintext)[:50]))
            # Restored from damaged source: keep the best keyword/wrap pair.
            if fit < best_fit:
                best_fit = fit
                best_keyword = keyword
                best_wrap_alphabet = wrap_alphabet
    logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
                '{2} and decrypt starting: {3}'.format(best_keyword,
                    best_wrap_alphabet, best_fit, sanitise(
                        keyword_decipher(message, best_keyword,
                                         best_wrap_alphabet))[:50]))
    return (best_keyword, best_wrap_alphabet), best_fit
def keyword_break_mp(message,
                     wordlist=keywords,
                     metric=norms.euclidean_distance,
                     target_counts=normalised_english_counts,
                     message_frequency_scaling=norms.normalise,
                     chunksize=500):
    """Breaks a keyword substitution cipher using a dictionary and
    frequency analysis, spreading the work across a process pool.

    >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
          'keyword decipherment', 'elephant', 1), \
          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    (('elephant', 1), 0.41643991598441...)
    """
    # `with` ensures the pool's worker processes are torn down even if a
    # worker raises.
    with Pool() as pool:
        helper_args = [(message, word, wrap, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist for wrap in range(3)]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
        # Each break is ((keyword, wrap), fit); the smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts,
                         message_frequency_scaling):
    """Score a single keyword/wrap-alphabet candidate for keyword_break_mp.

    Deciphers `message` with the candidate key, scales its letter
    frequencies, and returns ((keyword, wrap_alphabet), fit) where a lower
    fit means a closer match to `target_counts` under `metric`.
    """
    deciphered = keyword_decipher(message, keyword, wrap_alphabet)
    scaled = message_frequency_scaling(letter_frequencies(deciphered))
    fitness = metric(target_counts, scaled)
    logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
                 '{2} and decrypt starting: {3}'.format(keyword,
                     wrap_alphabet, fitness, sanitise(deciphered)[:50]))
    return (keyword, wrap_alphabet), fitness
def scytale_break(message,
                  metric=norms.euclidean_distance,
                  target_counts=normalised_english_bigram_counts,
                  message_frequency_scaling=norms.normalise):
    """Breaks a Scytale cipher

    >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
        'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
        'aek') # doctest: +ELLIPSIS
    (6, 0.83453041115025...)
    """
    best_key = 0
    best_fit = float("inf")
    # The n-gram size to score with is implied by the keys of the target
    # table (2 for bigrams, 3 for trigrams, ...).
    ngram_length = len(next(iter(target_counts.keys())))
    for key in range(1, 20):
        # Scytale only decrypts cleanly when the key divides the length.
        if len(message) % key == 0:
            plaintext = scytale_decipher(message, key)
            counts = message_frequency_scaling(frequencies(
                ngrams(sanitise(plaintext), ngram_length)))
            fit = metric(target_counts, counts)
            logger.debug('Scytale break attempt using key {0} gives fit of '
                         '{1} and decrypt starting: {2}'.format(key,
                             fit, sanitise(plaintext)[:50]))
            # Restored from damaged source: keep the best-scoring key.
            if fit < best_fit:
                best_fit = fit
                best_key = key
    logger.info('Scytale break best fit with key {0} gives fit of {1} and '
                'decrypt starting: {2}'.format(best_key, best_fit,
                    sanitise(scytale_decipher(message, best_key))[:50]))
    return best_key, best_fit
def column_transposition_break(message,
                               translist=transpositions,
                               metric=norms.euclidean_distance,
                               target_counts=normalised_english_bigram_counts,
                               message_frequency_scaling=norms.normalise):
    """Breaks a column transposition cipher using a dictionary and
    n-gram frequency analysis

    >>> column_transposition_break(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, \
            when homosexual acts were still illegal in the United Kingdom. "), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 0.898128626285...)
    >>> column_transposition_break(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
            "when homosexual acts were still illegal in the United Kingdom."), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 1.1958792913127...)
    """
    best_transposition = ''
    best_fit = float("inf")
    # n-gram size implied by the keys of the target table.
    ngram_length = len(next(iter(target_counts.keys())))
    for transposition in translist.keys():
        # Only transpositions whose width divides the message length
        # decrypt cleanly.
        if len(message) % len(transposition) == 0:
            plaintext = column_transposition_decipher(message, transposition)
            counts = message_frequency_scaling(frequencies(
                ngrams(sanitise(plaintext), ngram_length)))
            fit = metric(target_counts, counts)
            logger.debug('Column transposition break attempt using key {0} '
                         'gives fit of {1} and decrypt starting: {2}'.format(
                             translist[transposition][0], fit,
                             sanitise(plaintext)[:50]))
            # Restored from damaged source: keep the best transposition.
            if fit < best_fit:
                best_fit = fit
                best_transposition = transposition
    logger.info('Column transposition break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(
                    translist[best_transposition][0], best_fit,
                    sanitise(column_transposition_decipher(message,
                        best_transposition))[:50]))
    return best_transposition, best_fit
def column_transposition_break_mp(message,
                                  translist=transpositions,
                                  metric=norms.euclidean_distance,
                                  target_counts=normalised_english_bigram_counts,
                                  message_frequency_scaling=norms.normalise,
                                  chunksize=500):
    """Breaks a column transposition cipher using a dictionary and
    n-gram frequency analysis, spreading the work across a process pool.

    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, \
            when homosexual acts were still illegal in the United Kingdom. "), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 0.898128626285...)
    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
            "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
            "when homosexual acts were still illegal in the United Kingdom."), \
            'encipher'), \
        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
    ((2, 0, 5, 3, 1, 4, 6), 1.1958792913127...)
    """
    # n-gram size implied by the keys of the target table; computed once
    # here so each worker doesn't have to redo it.
    ngram_length = len(next(iter(target_counts.keys())))
    with Pool() as pool:
        helper_args = [(message, trans, metric, target_counts, ngram_length,
                        message_frequency_scaling)
                       for trans in translist.keys()]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(column_transposition_break_worker,
                              helper_args, chunksize)
        # Each break is (transposition, fit); the smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def column_transposition_break_worker(message, transposition, metric,
                                      target_counts, ngram_length,
                                      message_frequency_scaling):
    """Score a single transposition candidate for
    column_transposition_break_mp, returning (transposition, fit).
    Lower fit means the deciphered text's n-gram frequencies are closer
    to target_counts under metric.
    """
    plaintext = column_transposition_decipher(message, transposition)
    counts = message_frequency_scaling(frequencies(
        ngrams(sanitise(plaintext), ngram_length)))
    fit = metric(target_counts, counts)
    # Restored from damaged source: the format call was missing its
    # `transposition, fit` arguments for placeholders {0} and {1}.
    logger.debug('Column transposition break attempt using key {0} '
                 'gives fit of {1} and decrypt starting: {2}'.format(
                     transposition, fit,
                     sanitise(plaintext)[:50]))
    return transposition, fit
def vigenere_keyword_break(message,
                           wordlist=keywords,
                           metric=norms.euclidean_distance,
                           target_counts=normalised_english_counts,
                           message_frequency_scaling=norms.normalise):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis

    >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
             'message for the vigenere decipherment'), 'cat'), \
             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.4950195952826...)
    """
    best_keyword = ''
    best_fit = float("inf")
    for keyword in wordlist:
        plaintext = vigenere_decipher(message, keyword)
        counts = message_frequency_scaling(letter_frequencies(plaintext))
        fit = metric(target_counts, counts)
        logger.debug('Vigenere break attempt using key {0} '
                     'gives fit of {1} and decrypt starting: {2}'.format(
                         keyword, fit,
                         sanitise(plaintext)[:50]))
        # Restored from damaged source: keep the best-scoring keyword.
        if fit < best_fit:
            best_fit = fit
            best_keyword = keyword
    logger.info('Vigenere break best fit with key {0} gives fit '
                'of {1} and decrypt starting: {2}'.format(best_keyword,
                    best_fit, sanitise(
                        vigenere_decipher(message, best_keyword))[:50]))
    return best_keyword, best_fit
def vigenere_keyword_break_mp(message,
                              wordlist=keywords,
                              metric=norms.euclidean_distance,
                              target_counts=normalised_english_counts,
                              message_frequency_scaling=norms.normalise,
                              chunksize=500):
    """Breaks a vigenere cipher using a dictionary and
    frequency analysis, spreading the work across a process pool.

    >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
             'message for the vigenere decipherment'), 'cat'), \
             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
    ('cat', 0.4950195952826...)
    """
    with Pool() as pool:
        helper_args = [(message, word, metric, target_counts,
                        message_frequency_scaling)
                       for word in wordlist]
        # Gotcha: the helper function here needs to be defined at the top level
        # (limitation of Pool.starmap)
        breaks = pool.starmap(vigenere_keyword_break_worker,
                              helper_args, chunksize)
        # Each break is (keyword, fit); the smallest fit wins.
        return min(breaks, key=lambda k: k[1])
def vigenere_keyword_break_worker(message, keyword, metric, target_counts,
                                  message_frequency_scaling):
    """Score a single keyword candidate for vigenere_keyword_break_mp,
    returning (keyword, fit).  Lower fit means the deciphered text's
    letter frequencies are closer to target_counts under metric.
    """
    plaintext = vigenere_decipher(message, keyword)
    counts = message_frequency_scaling(letter_frequencies(plaintext))
    fit = metric(target_counts, counts)
    logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
                 '{1} and decrypt starting: {2}'.format(keyword,
                     fit, sanitise(plaintext)[:50]))
    # Restored from damaged source: without this return the worker yields
    # None and the caller's min() over (keyword, fit) pairs fails.
    return keyword, fit
425 if __name__
== "__main__":