break.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 from itertools import zip_longest, cycle
   6 from segment import segment
   7 from multiprocessing import Pool
   8
   9 from cipher import *
  10
  11 # To time a run:
  12 #
  13 # import timeit
  14 # c5a = open('2012/5a.ciphertext', 'r').read()
  15 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
  16 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
  17
  18
  19 english_counts = collections.defaultdict(int)
  20 with open('count_1l.txt', 'r') as f:
  21     for line in f:
  22         (letter, count) = line.split("\t")
  23         english_counts[letter] = int(count)
  24 normalised_english_counts = norms.normalise(english_counts)
  25
  26 english_bigram_counts = collections.defaultdict(int)
  27 with open('count_2l.txt', 'r') as f:
  28     for line in f:
  29         (bigram, count) = line.split("\t")
  30         english_bigram_counts[bigram] = int(count)
  31 normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
  32
  33 english_trigram_counts = collections.defaultdict(int)
  34 with open('count_3l.txt', 'r') as f:
  35     for line in f:
  36         (trigram, count) = line.split("\t")
  37         english_trigram_counts[trigram] = int(count)
  38 normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
  39
  40
  41 with open('words.txt', 'r') as f:
  42     keywords = [line.rstrip() for line in f]
  43
  44 transpositions = collections.defaultdict(list)
  45 for word in keywords:
  46     transpositions[transpositions_of(word)] += [word]
  47
  48 def frequencies(text):
  49     """Count the number of occurrences of each character in text
  50
  51     >>> sorted(frequencies('abcdefabc').items())
  52     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
  53     >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
  54          'dog').items()) # doctest: +NORMALIZE_WHITESPACE
  55     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
  56      ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
  57      ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
  58      ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  59     >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
  60          '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
  61     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
  62      ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
  63      ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
  64      ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
  65      ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
  66     >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
  67          'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
  68     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
  69      ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
  70      ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
  71      ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  72     >>> frequencies('abcdefabcdef')['x']
  73     0
  74     """
  75     #counts = collections.defaultdict(int)
  76     #for c in text:
  77     #    counts[c] += 1
  78     #return counts
  79     return collections.Counter(c for c in text)
  80 letter_frequencies = frequencies
  81
  82
  83
  84 def caesar_break(message,
  85                  metric=norms.euclidean_distance,
  86                  target_counts=normalised_english_counts,
  87                  message_frequency_scaling=norms.normalise):
  88     """Breaks a Caesar cipher using frequency analysis
  89
  90     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
  91           'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
  92     (4, 0.080345432737...)
  93     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
  94           'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
  95     (19, 0.11189290326...)
  96     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
  97           'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
  98     (13, 0.08293968842...)
  99     """
 100     sanitised_message = sanitise(message)
 101     best_shift = 0
 102     best_fit = float("inf")
 103     for shift in range(26):
 104         plaintext = caesar_decipher(sanitised_message, shift)
 105         counts = message_frequency_scaling(letter_frequencies(plaintext))
 106         fit = metric(target_counts, counts)
 107         logger.debug('Caesar break attempt using key {0} gives fit of {1} '
 108                       'and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 109         if fit < best_fit:
 110             best_fit = fit
 111             best_shift = shift
 112     logger.info('Caesar break best fit: key {0} gives fit of {1} and '
 113                 'decrypt starting: {2}'.format(best_shift, best_fit,
 114                     caesar_decipher(sanitised_message, best_shift)[:50]))
 115     return best_shift, best_fit
 116
 117 def affine_break(message,
 118                  metric=norms.euclidean_distance,
 119                  target_counts=normalised_english_counts,
 120                  message_frequency_scaling=norms.normalise):
 121     """Breaks an affine cipher using frequency analysis
 122
 123     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
 124           'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
 125           'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
 126           'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
 127           'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
 128           'clm ckuxj.') # doctest: +ELLIPSIS
 129     ((15, 22, True), 0.0598745365924...)
 130     """
 131     sanitised_message = sanitise(message)
 132     best_multiplier = 0
 133     best_adder = 0
 134     best_one_based = True
 135     best_fit = float("inf")
 136     for one_based in [True, False]:
 137         for multiplier in range(1, 26, 2):
 138             for adder in range(26):
 139                 plaintext = affine_decipher(sanitised_message,
 140                                             multiplier, adder, one_based)
 141                 counts = message_frequency_scaling(letter_frequencies(plaintext))
 142                 fit = metric(target_counts, counts)
 143                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
 144                              'gives fit of {3} and decrypt starting: {4}'.
 145                              format(multiplier, adder, one_based, fit,
 146                                     plaintext[:50]))
 147                 if fit < best_fit:
 148                     best_fit = fit
 149                     best_multiplier = multiplier
 150                     best_adder = adder
 151                     best_one_based = one_based
 152     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
 153                 'and decrypt starting: {4}'.format(
 154                     best_multiplier, best_adder, best_one_based, best_fit,
 155                     affine_decipher(sanitised_message, best_multiplier,
 156                         best_adder, best_one_based)[:50]))
 157     return (best_multiplier, best_adder, best_one_based), best_fit
 158
 159 def keyword_break(message,
 160                   wordlist=keywords,
 161                   metric=norms.euclidean_distance,
 162                   target_counts=normalised_english_counts,
 163                   message_frequency_scaling=norms.normalise):
 164     """Breaks a keyword substitution cipher using a dictionary and
 165     frequency analysis
 166
 167     >>> keyword_break(keyword_encipher('this is a test message for the ' \
 168           'keyword decipherment', 'elephant', 1), \
 169           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 170     (('elephant', 1), 0.1066453448861...)
 171     """
 172     best_keyword = ''
 173     best_wrap_alphabet = True
 174     best_fit = float("inf")
 175     for wrap_alphabet in range(3):
 176         for keyword in wordlist:
 177             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 178             counts = message_frequency_scaling(letter_frequencies(plaintext))
 179             fit = metric(target_counts, counts)
 180             logger.debug('Keyword break attempt using key {0} (wrap={1}) '
 181                          'gives fit of {2} and decrypt starting: {3}'.format(
 182                              keyword, wrap_alphabet, fit,
 183                              sanitise(plaintext)[:50]))
 184             if fit < best_fit:
 185                 best_fit = fit
 186                 best_keyword = keyword
 187                 best_wrap_alphabet = wrap_alphabet
 188     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
 189                 '{2} and decrypt starting: {3}'.format(best_keyword,
 190                     best_wrap_alphabet, best_fit, sanitise(
 191                         keyword_decipher(message, best_keyword,
 192                                          best_wrap_alphabet))[:50]))
 193     return (best_keyword, best_wrap_alphabet), best_fit
 194
 195 def keyword_break_mp(message,
 196                      wordlist=keywords,
 197                      metric=norms.euclidean_distance,
 198                      target_counts=normalised_english_counts,
 199                      message_frequency_scaling=norms.normalise,
 200                      chunksize=500):
 201     """Breaks a keyword substitution cipher using a dictionary and
 202     frequency analysis
 203
 204     >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
 205           'keyword decipherment', 'elephant', 1), \
 206           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 207     (('elephant', 1), 0.106645344886...)
 208     """
 209     with Pool() as pool:
 210         helper_args = [(message, word, wrap, metric, target_counts,
 211                         message_frequency_scaling)
 212                        for word in wordlist for wrap in range(3)]
 213         # Gotcha: the helper function here needs to be defined at the top level
 214         #   (limitation of Pool.starmap)
 215         breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
 216         return min(breaks, key=lambda k: k[1])
 217
 218 def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts,
 219                       message_frequency_scaling):
 220     plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 221     counts = message_frequency_scaling(letter_frequencies(plaintext))
 222     fit = metric(target_counts, counts)
 223     logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
 224                  '{2} and decrypt starting: {3}'.format(keyword,
 225                      wrap_alphabet, fit, sanitise(plaintext)[:50]))
 226     return (keyword, wrap_alphabet), fit
 227
 228 def scytale_break(message,
 229                   metric=norms.euclidean_distance,
 230                   target_counts=normalised_english_bigram_counts,
 231                   message_frequency_scaling=norms.normalise):
 232     """Breaks a Scytale cipher
 233
 234     >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
 235            'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
 236            'aek') # doctest: +ELLIPSIS
 237     (6, 0.092599933059...)
 238     """
 239     best_key = 0
 240     best_fit = float("inf")
 241     ngram_length = len(next(iter(target_counts.keys())))
 242     for key in range(1, 20):
 243         if len(message) % key == 0:
 244             plaintext = scytale_decipher(message, key)
 245             counts = message_frequency_scaling(frequencies(
 246                          ngrams(sanitise(plaintext), ngram_length)))
 247             fit = metric(target_counts, counts)
 248             logger.debug('Scytale break attempt using key {0} gives fit of '
 249                          '{1} and decrypt starting: {2}'.format(key,
 250                              fit, sanitise(plaintext)[:50]))
 251             if fit < best_fit:
 252                 best_fit = fit
 253                 best_key = key
 254     logger.info('Scytale break best fit with key {0} gives fit of {1} and '
 255                 'decrypt starting: {2}'.format(best_key, best_fit,
 256                     sanitise(scytale_decipher(message, best_key))[:50]))
 257     return best_key, best_fit
 258
 259 def column_transposition_break(message,
 260                   translist=transpositions,
 261                   metric=norms.euclidean_distance,
 262                   target_counts=normalised_english_bigram_counts,
 263                   message_frequency_scaling=norms.normalise):
 264     """Breaks a column transposition cipher using a dictionary and
 265     n-gram frequency analysis
 266
 267     >>> column_transposition_break(column_transposition_encipher(sanitise( \
 268             "It is a truth universally acknowledged, that a single man in \
 269              possession of a good fortune, must be in want of a wife. However \
 270              little known the feelings or views of such a man may be on his \
 271              first entering a neighbourhood, this truth is so well fixed in the \
 272              minds of the surrounding families, that he is considered the \
 273              rightful property of some one or other of their daughters."), \
 274         'encipher'), \
 275         translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
 276                    (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
 277                    (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
 278     ((2, 0, 5, 3, 1, 4, 6), 0.0628106372...)
 279     >>> column_transposition_break(column_transposition_encipher(sanitise( \
 280             "It is a truth universally acknowledged, that a single man in \
 281              possession of a good fortune, must be in want of a wife. However \
 282              little known the feelings or views of such a man may be on his \
 283              first entering a neighbourhood, this truth is so well fixed in the \
 284              minds of the surrounding families, that he is considered the \
 285              rightful property of some one or other of their daughters."), \
 286         'encipher'), \
 287         translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
 288                    (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
 289                    (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
 290         target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
 291     ((2, 0, 5, 3, 1, 4, 6), 0.0592259560...)
 292     """
 293     best_transposition = ''
 294     best_fit = float("inf")
 295     ngram_length = len(next(iter(target_counts.keys())))
 296     for transposition in translist.keys():
 297         if len(message) % len(transposition) == 0:
 298             plaintext = column_transposition_decipher(message, transposition)
 299             counts = message_frequency_scaling(frequencies(
 300                          ngrams(sanitise(plaintext), ngram_length)))
 301             fit = metric(target_counts, counts)
 302             logger.debug('Column transposition break attempt using key {0} '
 303                          'gives fit of {1} and decrypt starting: {2}'.format(
 304                              translist[transposition][0], fit,
 305                              sanitise(plaintext)[:50]))
 306             if fit < best_fit:
 307                 best_fit = fit
 308                 best_transposition = transposition
 309     logger.info('Column transposition break best fit with key {0} gives fit '
 310                 'of {1} and decrypt starting: {2}'.format(
 311                     translist[best_transposition][0],
 312                     best_fit, sanitise(
 313                         column_transposition_decipher(message,
 314                             best_transposition))[:50]))
 315     return best_transposition, best_fit
 316
 317
 318 def column_transposition_break_mp(message,
 319                      translist=transpositions,
 320                      metric=norms.euclidean_distance,
 321                      target_counts=normalised_english_bigram_counts,
 322                      message_frequency_scaling=norms.normalise,
 323                      chunksize=500):
 324     """Breaks a column transposition cipher using a dictionary and
 325     n-gram frequency analysis
 326
 327     >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
 328             "It is a truth universally acknowledged, that a single man in \
 329              possession of a good fortune, must be in want of a wife. However \
 330              little known the feelings or views of such a man may be on his \
 331              first entering a neighbourhood, this truth is so well fixed in the \
 332              minds of the surrounding families, that he is considered the \
 333              rightful property of some one or other of their daughters."), \
 334         'encipher'), \
 335         translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
 336                    (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
 337                    (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
 338     ((2, 0, 5, 3, 1, 4, 6), 0.0628106372...)
 339     >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
 340             "It is a truth universally acknowledged, that a single man in \
 341              possession of a good fortune, must be in want of a wife. However \
 342              little known the feelings or views of such a man may be on his \
 343              first entering a neighbourhood, this truth is so well fixed in the \
 344              minds of the surrounding families, that he is considered the \
 345              rightful property of some one or other of their daughters."), \
 346         'encipher'), \
 347         translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
 348                    (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
 349                    (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
 350         target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
 351     ((2, 0, 5, 3, 1, 4, 6), 0.0592259560...)
 352     """
 353     ngram_length = len(next(iter(target_counts.keys())))
 354     with Pool() as pool:
 355         helper_args = [(message, trans, metric, target_counts, ngram_length,
 356                         message_frequency_scaling)
 357                        for trans in translist.keys()]
 358         # Gotcha: the helper function here needs to be defined at the top level
 359         #   (limitation of Pool.starmap)
 360         breaks = pool.starmap(column_transposition_break_worker, helper_args, chunksize)
 361         return min(breaks, key=lambda k: k[1])
 362
 363 def column_transposition_break_worker(message, transposition, metric, target_counts,
 364                       ngram_length, message_frequency_scaling):
 365     plaintext = column_transposition_decipher(message, transposition)
 366     counts = message_frequency_scaling(frequencies(
 367                          ngrams(sanitise(plaintext), ngram_length)))
 368     fit = metric(target_counts, counts)
 369     logger.debug('Column transposition break attempt using key {0} '
 370                          'gives fit of {1} and decrypt starting: {2}'.format(
 371                              transposition, fit,
 372                              sanitise(plaintext)[:50]))
 373     return transposition, fit
 374
 375 def vigenere_keyword_break(message,
 376                   wordlist=keywords,
 377                   metric=norms.euclidean_distance,
 378                   target_counts=normalised_english_counts,
 379                   message_frequency_scaling=norms.normalise):
 380     """Breaks a vigenere cipher using a dictionary and
 381     frequency analysis
 382
 383     >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
 384              'message for the vigenere decipherment'), 'cat'), \
 385              wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 386     ('cat', 0.15965224935...)
 387     """
 388     best_keyword = ''
 389     best_fit = float("inf")
 390     for keyword in wordlist:
 391         plaintext = vigenere_decipher(message, keyword)
 392         counts = message_frequency_scaling(letter_frequencies(plaintext))
 393         fit = metric(target_counts, counts)
 394         logger.debug('Vigenere break attempt using key {0} '
 395                          'gives fit of {1} and decrypt starting: {2}'.format(
 396                              keyword, fit,
 397                              sanitise(plaintext)[:50]))
 398         if fit < best_fit:
 399             best_fit = fit
 400             best_keyword = keyword
 401     logger.info('Vigenere break best fit with key {0} gives fit '
 402                 'of {1} and decrypt starting: {2}'.format(best_keyword,
 403                     best_fit, sanitise(
 404                         vigenere_decipher(message, best_keyword))[:50]))
 405     return best_keyword, best_fit
 406
 407 def vigenere_keyword_break_mp(message,
 408                      wordlist=keywords,
 409                      metric=norms.euclidean_distance,
 410                      target_counts=normalised_english_counts,
 411                      message_frequency_scaling=norms.normalise,
 412                      chunksize=500):
 413     """Breaks a vigenere cipher using a dictionary and
 414     frequency analysis
 415
 416     >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
 417              'message for the vigenere decipherment'), 'cat'), \
 418              wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 419     ('cat', 0.159652249358...)
 420     """
 421     with Pool() as pool:
 422         helper_args = [(message, word, metric, target_counts,
 423                         message_frequency_scaling)
 424                        for word in wordlist]
 425         # Gotcha: the helper function here needs to be defined at the top level
 426         #   (limitation of Pool.starmap)
 427         breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize)
 428         return min(breaks, key=lambda k: k[1])
 429
 430 def vigenere_keyword_break_worker(message, keyword, metric, target_counts,
 431                       message_frequency_scaling):
 432     plaintext = vigenere_decipher(message, keyword)
 433     counts = message_frequency_scaling(letter_frequencies(plaintext))
 434     fit = metric(target_counts, counts)
 435     logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
 436                  '{1} and decrypt starting: {2}'.format(keyword,
 437                      fit, sanitise(plaintext)[:50]))
 438     return keyword, fit
 439
 440
 441 def vigenere_ic_break(message, target_counts=normalised_english_counts):
 442     key_length = vigenere_key_length(message),
 443     key = vigenere_find_key(message, key_length)
 444     return key
 445
 446 def vigenere_key_length(message):
 447     best_length = 0
 448     best_ic = 0.0
 449     for trial_length in range(1, 20):
 450         splits = every_nth(message, trial_length)
 451         freqs = [norms.scale(frequencies(s)) for s in splits]
 452         ic = sum([sum([f ** 2 for f in fs.values()]) for fs in freqs]) / trial_length
 453         logger.debug('Vigenere key length of {0} gives IC of {1}'.
 454                      format(trial_length, ic))
 455         if ic > best_ic:
 456             best_length = trial_length
 457             best_ic = ic
 458     return best_length, best_ic
 459
 460 def vigenere_find_key(message, key_length):
 461     splits = every_nth(message, key_length)
 462     return ''.join([chr(caesar_break(s)[0] + ord('a')) for s in splits])
 463
 464
 465 if __name__ == "__main__":
 466     import doctest
 467     doctest.testmod()
 468