cipher.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 import math
   6 from itertools import zip_longest, repeat, cycle
   7 from segment import segment
   8 from multiprocessing import Pool
   9
  10 # To time a run:
  11 #
  12 # import timeit
  13 # c5a = open('2012/5a.ciphertext', 'r').read()
  14 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
  15 # timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1
  16
  17 logger = logging.getLogger(__name__)
  18 logger.addHandler(logging.FileHandler('cipher.log'))
  19 logger.setLevel(logging.WARNING)
  20 #logger.setLevel(logging.INFO)
  21 #logger.setLevel(logging.DEBUG)
  22
  23 english_counts = collections.defaultdict(int)
  24 with open('count_1l.txt', 'r') as f:
  25     for line in f:
  26         (letter, count) = line.split("\t")
  27         english_counts[letter] = int(count)
  28 normalised_english_counts = norms.normalise(english_counts)
  29
  30 english_bigram_counts = collections.defaultdict(int)
  31 with open('count_2l.txt', 'r') as f:
  32     for line in f:
  33         (bigram, count) = line.split("\t")
  34         english_bigram_counts[bigram] = int(count)
  35 normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
  36
  37 english_trigram_counts = collections.defaultdict(int)
  38 with open('count_3l.txt', 'r') as f:
  39     for line in f:
  40         (trigram, count) = line.split("\t")
  41         english_trigram_counts[trigram] = int(count)
  42 normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
  43
  44
  45 with open('words.txt', 'r') as f:
  46     keywords = [line.rstrip() for line in f]
  47
  48 modular_division_table = [[0]*26 for x in range(26)]
  49 for a in range(26):
  50     for b in range(26):
  51         c = (a * b) % 26
  52         modular_division_table[b][c] = a
  53
  54 def letters(text):
  55     """Remove all non-alphabetic characters from a text
  56     >>> letters('The Quick')
  57     'TheQuick'
  58     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  59     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  60     """
  61     return ''.join([c for c in text if c in string.ascii_letters])
  62
  63 def sanitise(text):
  64     """Remove all non-alphabetic characters and convert the text to lowercase
  65
  66     >>> sanitise('The Quick')
  67     'thequick'
  68     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  69     'thequickbrownfoxjumpedoverthelazydog'
  70     """
  71     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  72     # return ''.join(sanitised)
  73     return letters(text).lower()
  74
  75 def ngrams(text, n):
  76     """Returns all n-grams of a text
  77
  78     >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE
  79     ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn',
  80      'nf', 'fo', 'ox']
  81     >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE
  82     ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow',
  83      'rown', 'ownf', 'wnfo', 'nfox']
  84     """
  85     return [text[i:i+n] for i in range(len(text)-n+1)]
  86
  87 def every_nth(text, n, fillvalue=''):
  88     """Returns n strings, each of which consists of every nth character,
  89     starting with the 0th, 1st, 2nd, ... (n-1)th character
  90
  91     >>> every_nth(string.ascii_lowercase, 5)
  92     ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
  93     >>> every_nth(string.ascii_lowercase, 1)
  94     ['abcdefghijklmnopqrstuvwxyz']
  95     >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
  96     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
  97      'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  98     >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
  99     ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
 100     """
 101     split_text = [text[i:i+n] for i in range(0, len(text), n)]
 102     return [''.join(l) for l in zip_longest(*split_text, fillvalue=fillvalue)]
 103
 104 def combine_every_nth(split_text):
 105     """Reforms a text split into every_nth strings
 106
 107     >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
 108     'abcdefghijklmnopqrstuvwxyz'
 109     >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
 110     'abcdefghijklmnopqrstuvwxyz'
 111     >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
 112     'abcdefghijklmnopqrstuvwxyz'
 113     """
 114     return ''.join([''.join(l)
 115                     for l in zip_longest(*split_text, fillvalue='')])
 116
 117 def transpose(items, transposition):
 118     """Moves items around according to the given transposition
 119
 120     >>> transpose(['a', 'b', 'c', 'd'], [0,1,2,3])
 121     ['a', 'b', 'c', 'd']
 122     >>> transpose(['a', 'b', 'c', 'd'], [3,1,2,0])
 123     ['d', 'b', 'c', 'a']
 124     >>> transpose([10,11,12,13,14,15], [3,2,4,1,5,0])
 125     [13, 12, 14, 11, 15, 10]
 126     """
 127     transposed = list(repeat('', len(transposition)))
 128     for p, t in enumerate(transposition):
 129        transposed[p] = items[t]
 130     return transposed
 131
 132 def untranspose(items, transposition):
 133     """Undoes a transpose
 134
 135     >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
 136     ['a', 'b', 'c', 'd']
 137     >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
 138     ['a', 'b', 'c', 'd']
 139     >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
 140     [10, 11, 12, 13, 14, 15]
 141     """
 142     transposed  = list(repeat('', len(transposition)))
 143     for p, t in enumerate(transposition):
 144        transposed[t] = items[p]
 145     return transposed
 146
 147
 148 def frequencies(text):
 149     """Count the number of occurrences of each character in text
 150
 151     >>> sorted(frequencies('abcdefabc').items())
 152     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
 153     >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
 154          'dog').items()) # doctest: +NORMALIZE_WHITESPACE
 155     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
 156      ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
 157      ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
 158      ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 159     >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
 160          '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
 161     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
 162      ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
 163      ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
 164      ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
 165      ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
 166     >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
 167          'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
 168     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
 169      ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
 170      ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
 171      ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 172     >>> frequencies('abcdefabcdef')['x']
 173     0
 174     """
 175     #counts = collections.defaultdict(int)
 176     #for c in text:
 177     #    counts[c] += 1
 178     #return counts
 179     return collections.Counter(c for c in text)
 180 letter_frequencies = frequencies
 181
 182 def deduplicate(text):
 183     return list(collections.OrderedDict.fromkeys(text))
 184
 185
 186
 187 def caesar_encipher_letter(letter, shift):
 188     """Encipher a letter, given a shift amount
 189
 190     >>> caesar_encipher_letter('a', 1)
 191     'b'
 192     >>> caesar_encipher_letter('a', 2)
 193     'c'
 194     >>> caesar_encipher_letter('b', 2)
 195     'd'
 196     >>> caesar_encipher_letter('x', 2)
 197     'z'
 198     >>> caesar_encipher_letter('y', 2)
 199     'a'
 200     >>> caesar_encipher_letter('z', 2)
 201     'b'
 202     >>> caesar_encipher_letter('z', -1)
 203     'y'
 204     >>> caesar_encipher_letter('a', -1)
 205     'z'
 206     """
 207     if letter in string.ascii_letters:
 208         if letter in string.ascii_uppercase:
 209             alphabet_start = ord('A')
 210         else:
 211             alphabet_start = ord('a')
 212         return chr(((ord(letter) - alphabet_start + shift) % 26) +
 213                    alphabet_start)
 214     else:
 215         return letter
 216
 217 def caesar_decipher_letter(letter, shift):
 218     """Decipher a letter, given a shift amount
 219
 220     >>> caesar_decipher_letter('b', 1)
 221     'a'
 222     >>> caesar_decipher_letter('b', 2)
 223     'z'
 224     """
 225     return caesar_encipher_letter(letter, -shift)
 226
 227 def caesar_encipher(message, shift):
 228     """Encipher a message with the Caesar cipher of given shift
 229
 230     >>> caesar_encipher('abc', 1)
 231     'bcd'
 232     >>> caesar_encipher('abc', 2)
 233     'cde'
 234     >>> caesar_encipher('abcxyz', 2)
 235     'cdezab'
 236     >>> caesar_encipher('ab cx yz', 2)
 237     'cd ez ab'
 238     """
 239     enciphered = [caesar_encipher_letter(l, shift) for l in message]
 240     return ''.join(enciphered)
 241
 242 def caesar_decipher(message, shift):
 243     """Encipher a message with the Caesar cipher of given shift
 244
 245     >>> caesar_decipher('bcd', 1)
 246     'abc'
 247     >>> caesar_decipher('cde', 2)
 248     'abc'
 249     >>> caesar_decipher('cd ez ab', 2)
 250     'ab cx yz'
 251     """
 252     return caesar_encipher(message, -shift)
 253
 254 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
 255     """Encipher a letter, given a multiplier and adder
 256
 257     >>> ''.join([affine_encipher_letter(l, 3, 5, True) \
 258             for l in string.ascii_uppercase])
 259     'HKNQTWZCFILORUXADGJMPSVYBE'
 260     >>> ''.join([affine_encipher_letter(l, 3, 5, False) \
 261             for l in string.ascii_uppercase])
 262     'FILORUXADGJMPSVYBEHKNQTWZC'
 263     """
 264     if letter in string.ascii_letters:
 265         if letter in string.ascii_uppercase:
 266             alphabet_start = ord('A')
 267         else:
 268             alphabet_start = ord('a')
 269         letter_number = ord(letter) - alphabet_start
 270         if one_based: letter_number += 1
 271         cipher_number = (letter_number * multiplier + adder) % 26
 272         if one_based: cipher_number -= 1
 273         return chr(cipher_number % 26 + alphabet_start)
 274     else:
 275         return letter
 276
 277 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
 278     """Encipher a letter, given a multiplier and adder
 279
 280     >>> ''.join([affine_decipher_letter(l, 3, 5, True) \
 281             for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
 282     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 283     >>> ''.join([affine_decipher_letter(l, 3, 5, False) \
 284             for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
 285     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 286     """
 287     if letter in string.ascii_letters:
 288         if letter in string.ascii_uppercase:
 289             alphabet_start = ord('A')
 290         else:
 291             alphabet_start = ord('a')
 292         cipher_number = ord(letter) - alphabet_start
 293         if one_based: cipher_number += 1
 294         plaintext_number = ( modular_division_table[multiplier]
 295                                                    [(cipher_number - adder) % 26] )
 296         if one_based: plaintext_number -= 1
 297         return chr(plaintext_number % 26 + alphabet_start)
 298     else:
 299         return letter
 300
 301 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
 302     """Encipher a message
 303
 304     >>> affine_encipher('hours passed during which jerico tried every ' \
 305            'trick he could think of', 15, 22, True)
 306     'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
 307     """
 308     enciphered = [affine_encipher_letter(l, multiplier, adder, one_based)
 309                   for l in message]
 310     return ''.join(enciphered)
 311
 312 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
 313     """Decipher a message
 314
 315     >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg ' \
 316            'jfaoe ls omytd jlaxe mh', 15, 22, True)
 317     'hours passed during which jerico tried every trick he could think of'
 318     """
 319     enciphered = [affine_decipher_letter(l, multiplier, adder, one_based)
 320                   for l in message]
 321     return ''.join(enciphered)
 322
 323
 324 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
 325     """Find the cipher alphabet given a keyword.
 326     wrap_alphabet controls how the rest of the alphabet is added
 327     after the keyword.
 328     0 : from 'a'
 329     1 : from the last letter in the sanitised keyword
 330     2 : from the largest letter in the sanitised keyword
 331
 332     >>> keyword_cipher_alphabet_of('bayes')
 333     'bayescdfghijklmnopqrtuvwxz'
 334     >>> keyword_cipher_alphabet_of('bayes', 0)
 335     'bayescdfghijklmnopqrtuvwxz'
 336     >>> keyword_cipher_alphabet_of('bayes', 1)
 337     'bayestuvwxzcdfghijklmnopqr'
 338     >>> keyword_cipher_alphabet_of('bayes', 2)
 339     'bayeszcdfghijklmnopqrtuvwx'
 340     """
 341     if wrap_alphabet == 0:
 342         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
 343                                               string.ascii_lowercase))
 344     else:
 345         if wrap_alphabet == 1:
 346             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
 347         else:
 348             last_keyword_letter = sorted(sanitise(keyword))[-1]
 349         last_keyword_position = string.ascii_lowercase.find(
 350             last_keyword_letter) + 1
 351         cipher_alphabet = ''.join(
 352             deduplicate(sanitise(keyword) +
 353                         string.ascii_lowercase[last_keyword_position:] +
 354                         string.ascii_lowercase))
 355     return cipher_alphabet
 356
 357
 358 def keyword_encipher(message, keyword, wrap_alphabet=0):
 359     """Enciphers a message with a keyword substitution cipher.
 360     wrap_alphabet controls how the rest of the alphabet is added
 361     after the keyword.
 362     0 : from 'a'
 363     1 : from the last letter in the sanitised keyword
 364     2 : from the largest letter in the sanitised keyword
 365
 366     >>> keyword_encipher('test message', 'bayes')
 367     'rsqr ksqqbds'
 368     >>> keyword_encipher('test message', 'bayes', 0)
 369     'rsqr ksqqbds'
 370     >>> keyword_encipher('test message', 'bayes', 1)
 371     'lskl dskkbus'
 372     >>> keyword_encipher('test message', 'bayes', 2)
 373     'qspq jsppbcs'
 374     """
 375     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 376     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
 377     return message.lower().translate(cipher_translation)
 378
 379 def keyword_decipher(message, keyword, wrap_alphabet=0):
 380     """Deciphers a message with a keyword substitution cipher.
 381     wrap_alphabet controls how the rest of the alphabet is added
 382     after the keyword.
 383     0 : from 'a'
 384     1 : from the last letter in the sanitised keyword
 385     2 : from the largest letter in the sanitised keyword
 386
 387     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
 388     'test message'
 389     >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
 390     'test message'
 391     >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
 392     'test message'
 393     >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
 394     'test message'
 395     """
 396     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 397     cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
 398     return message.lower().translate(cipher_translation)
 399
 400 def scytale_encipher(message, rows):
 401     """Enciphers using the scytale transposition cipher.
 402     Message is padded with spaces to allow all rows to be the same length.
 403
 404     >>> scytale_encipher('thequickbrownfox', 3)
 405     'tcnhkfeboqrxuo iw '
 406     >>> scytale_encipher('thequickbrownfox', 4)
 407     'tubnhirfecooqkwx'
 408     >>> scytale_encipher('thequickbrownfox', 5)
 409     'tubn hirf ecoo qkwx '
 410     >>> scytale_encipher('thequickbrownfox', 6)
 411     'tqcrnxhukof eibwo '
 412     >>> scytale_encipher('thequickbrownfox', 7)
 413     'tqcrnx hukof  eibwo  '
 414     """
 415     if len(message) % rows != 0:
 416         message += ' '*(rows - len(message) % rows)
 417     row_length = round(len(message) / rows)
 418     slices = [message[i:i+row_length]
 419               for i in range(0, len(message), row_length)]
 420     return ''.join([''.join(r) for r in zip_longest(*slices, fillvalue='')])
 421
 422 def scytale_decipher(message, rows):
 423     """Deciphers using the scytale transposition cipher.
 424     Assumes the message is padded so that all rows are the same length.
 425
 426     >>> scytale_decipher('tcnhkfeboqrxuo iw ', 3)
 427     'thequickbrownfox  '
 428     >>> scytale_decipher('tubnhirfecooqkwx', 4)
 429     'thequickbrownfox'
 430     >>> scytale_decipher('tubn hirf ecoo qkwx ', 5)
 431     'thequickbrownfox    '
 432     >>> scytale_decipher('tqcrnxhukof eibwo ', 6)
 433     'thequickbrownfox  '
 434     >>> scytale_decipher('tqcrnx hukof  eibwo  ', 7)
 435     'thequickbrownfox     '
 436     """
 437     cols = round(len(message) / rows)
 438     columns = [message[i:i+rows] for i in range(0, cols * rows, rows)]
 439     return ''.join([''.join(c) for c in zip_longest(*columns, fillvalue='')])
 440
 441
 442 def transpositions_of(keyword):
 443     """Finds the transpostions given by a keyword. For instance, the keyword
 444     'clever' rearranges to 'celrv', so the first column (0) stays first, the
 445     second column (1) moves to third, the third column (2) moves to second,
 446     and so on.
 447
 448     >>> transpositions_of('clever')
 449     [0, 2, 1, 4, 3]
 450     """
 451     key = deduplicate(keyword)
 452     transpositions = [key.index(l) for l in sorted(key)]
 453     return transpositions
 454
 455 def column_transposition_encipher(message, keyword, fillvalue=' '):
 456     """Enciphers using the column transposition cipher.
 457     Message is padded to allow all rows to be the same length.
 458
 459     >>> column_transposition_encipher('hellothere', 'clever')
 460     'hleolteher'
 461     >>> column_transposition_encipher('hellothere', 'cleverly', fillvalue='!')
 462     'hleolthre!e!'
 463     """
 464     return column_transposition_worker(message, keyword, encipher=True,
 465                                        fillvalue=fillvalue)
 466
 467 def column_transposition_decipher(message, keyword, fillvalue=' '):
 468     """Deciphers using the column transposition cipher.
 469     Message is padded to allow all rows to be the same length.
 470
 471     >>> column_transposition_decipher('hleolteher', 'clever')
 472     'hellothere'
 473     >>> column_transposition_decipher('hleolthre!e!', 'cleverly', fillvalue='?')
 474     'hellothere!!'
 475     """
 476     return column_transposition_worker(message, keyword, encipher=False,
 477                                        fillvalue=fillvalue)
 478
 479 def column_transposition_worker(message, keyword,
 480                                 encipher=True, fillvalue=' '):
 481     """Does the actual work of the column transposition cipher.
 482     Message is padded with spaces to allow all rows to be the same length.
 483
 484     >>> column_transposition_worker('hellothere', 'clever')
 485     'hleolteher'
 486     >>> column_transposition_worker('hellothere', 'clever', encipher=True)
 487     'hleolteher'
 488     >>> column_transposition_worker('hleolteher', 'clever', encipher=False)
 489     'hellothere'
 490     """
 491     transpositions = transpositions_of(keyword)
 492     columns = every_nth(message, len(transpositions), fillvalue=fillvalue)
 493     if encipher:
 494         transposed_columns = transpose(columns, transpositions)
 495     else:
 496         transposed_columns = untranspose(columns, transpositions)
 497     return combine_every_nth(transposed_columns)
 498
 499 def vigenere_encipher(message, keyword):
 500     """Vigenere encipher
 501
 502     >>> vigenere_encipher('hello', 'abc')
 503     'hfnlp'
 504     """
 505     shifts = [ord(l) - ord('a') for l in sanitise(keyword)]
 506     pairs = zip(message, cycle(shifts))
 507     return ''.join([caesar_encipher_letter(l, k) for l, k in pairs])
 508
 509 def vigenere_decipher(message, keyword):
 510     """Vigenere decipher
 511
 512     >>> vigenere_decipher('hfnlp', 'abc')
 513     'hello'
 514     """
 515     shifts = [ord(l) - ord('a') for l in sanitise(keyword)]
 516     pairs = zip(message, cycle(shifts))
 517     return ''.join([caesar_decipher_letter(l, k) for l, k in pairs])
 518
 519
 520
 521 def caesar_break(message,
 522                  metric=norms.euclidean_distance,
 523                  target_counts=normalised_english_counts,
 524                  message_frequency_scaling=norms.normalise):
 525     """Breaks a Caesar cipher using frequency analysis
 526
 527     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
 528           'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
 529     (4, 0.31863952890183...)
 530     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
 531           'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
 532     (19, 0.42152901235832...)
 533     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
 534           'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
 535     (13, 0.316029208075451...)
 536     """
 537     sanitised_message = sanitise(message)
 538     best_shift = 0
 539     best_fit = float("inf")
 540     for shift in range(26):
 541         plaintext = caesar_decipher(sanitised_message, shift)
 542         counts = message_frequency_scaling(letter_frequencies(plaintext))
 543         fit = metric(target_counts, counts)
 544         logger.debug('Caesar break attempt using key {0} gives fit of {1} '
 545                       'and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 546         if fit < best_fit:
 547             best_fit = fit
 548             best_shift = shift
 549     logger.info('Caesar break best fit: key {0} gives fit of {1} and '
 550                 'decrypt starting: {2}'.format(best_shift, best_fit,
 551                     caesar_decipher(sanitised_message, best_shift)[:50]))
 552     return best_shift, best_fit
 553
 554 def affine_break(message,
 555                  metric=norms.euclidean_distance,
 556                  target_counts=normalised_english_counts,
 557                  message_frequency_scaling=norms.normalise):
 558     """Breaks an affine cipher using frequency analysis
 559
 560     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
 561           'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
 562           'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
 563           'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
 564           'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
 565           'clm ckuxj.') # doctest: +ELLIPSIS
 566     ((15, 22, True), 0.23570361818655...)
 567     """
 568     sanitised_message = sanitise(message)
 569     best_multiplier = 0
 570     best_adder = 0
 571     best_one_based = True
 572     best_fit = float("inf")
 573     for one_based in [True, False]:
 574         for multiplier in range(1, 26, 2):
 575             for adder in range(26):
 576                 plaintext = affine_decipher(sanitised_message,
 577                                             multiplier, adder, one_based)
 578                 counts = message_frequency_scaling(letter_frequencies(plaintext))
 579                 fit = metric(target_counts, counts)
 580                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
 581                              'gives fit of {3} and decrypt starting: {4}'.
 582                              format(multiplier, adder, one_based, fit,
 583                                     plaintext[:50]))
 584                 if fit < best_fit:
 585                     best_fit = fit
 586                     best_multiplier = multiplier
 587                     best_adder = adder
 588                     best_one_based = one_based
 589     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
 590                 'and decrypt starting: {4}'.format(
 591                     best_multiplier, best_adder, best_one_based, best_fit,
 592                     affine_decipher(sanitised_message, best_multiplier,
 593                         best_adder, best_one_based)[:50]))
 594     return (best_multiplier, best_adder, best_one_based), best_fit
 595
 596 def keyword_break(message,
 597                   wordlist=keywords,
 598                   metric=norms.euclidean_distance,
 599                   target_counts=normalised_english_counts,
 600                   message_frequency_scaling=norms.normalise):
 601     """Breaks a keyword substitution cipher using a dictionary and
 602     frequency analysis
 603
 604     >>> keyword_break(keyword_encipher('this is a test message for the ' \
 605           'keyword decipherment', 'elephant', 1), \
 606           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 607     (('elephant', 1), 0.41643991598441...)
 608     """
 609     best_keyword = ''
 610     best_wrap_alphabet = True
 611     best_fit = float("inf")
 612     for wrap_alphabet in range(3):
 613         for keyword in wordlist:
 614             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 615             counts = message_frequency_scaling(letter_frequencies(plaintext))
 616             fit = metric(target_counts, counts)
 617             logger.debug('Keyword break attempt using key {0} (wrap={1}) '
 618                          'gives fit of {2} and decrypt starting: {3}'.format(
 619                              keyword, wrap_alphabet, fit,
 620                              sanitise(plaintext)[:50]))
 621             if fit < best_fit:
 622                 best_fit = fit
 623                 best_keyword = keyword
 624                 best_wrap_alphabet = wrap_alphabet
 625     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
 626                 '{2} and decrypt starting: {3}'.format(best_keyword,
 627                     best_wrap_alphabet, best_fit, sanitise(
 628                         keyword_decipher(message, best_keyword,
 629                                          best_wrap_alphabet))[:50]))
 630     return (best_keyword, best_wrap_alphabet), best_fit
 631
 632 def keyword_break_mp(message,
 633                      wordlist=keywords,
 634                      metric=norms.euclidean_distance,
 635                      target_counts=normalised_english_counts,
 636                      message_frequency_scaling=norms.normalise,
 637                      chunksize=500):
 638     """Breaks a keyword substitution cipher using a dictionary and
 639     frequency analysis
 640
 641     >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
 642           'keyword decipherment', 'elephant', 1), \
 643           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 644     (('elephant', 1), 0.41643991598441...)
 645     """
 646     with Pool() as pool:
 647         helper_args = [(message, word, wrap, metric, target_counts,
 648                         message_frequency_scaling)
 649                        for word in wordlist for wrap in range(3)]
 650         # Gotcha: the helper function here needs to be defined at the top level
 651         #   (limitation of Pool.starmap)
 652         breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
 653         return min(breaks, key=lambda k: k[1])
 654
 655 def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts,
 656                       message_frequency_scaling):
 657     plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 658     counts = message_frequency_scaling(letter_frequencies(plaintext))
 659     fit = metric(target_counts, counts)
 660     logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
 661                  '{2} and decrypt starting: {3}'.format(keyword,
 662                      wrap_alphabet, fit, sanitise(plaintext)[:50]))
 663     return (keyword, wrap_alphabet), fit
 664
 665 def scytale_break(message,
 666                   metric=norms.euclidean_distance,
 667                   target_counts=normalised_english_bigram_counts,
 668                   message_frequency_scaling=norms.normalise):
 669     """Breaks a Scytale cipher
 670
 671     >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
 672            'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
 673            'aek') # doctest: +ELLIPSIS
 674     (6, 0.83453041115025...)
 675     """
 676     best_key = 0
 677     best_fit = float("inf")
 678     ngram_length = len(next(iter(target_counts.keys())))
 679     for key in range(1, 20):
 680         if len(message) % key == 0:
 681             plaintext = scytale_decipher(message, key)
 682             counts = message_frequency_scaling(frequencies(
 683                          ngrams(sanitise(plaintext), ngram_length)))
 684             fit = metric(target_counts, counts)
 685             logger.debug('Scytale break attempt using key {0} gives fit of '
 686                          '{1} and decrypt starting: {2}'.format(key,
 687                              fit, sanitise(plaintext)[:50]))
 688             if fit < best_fit:
 689                 best_fit = fit
 690                 best_key = key
 691     logger.info('Scytale break best fit with key {0} gives fit of {1} and '
 692                 'decrypt starting: {2}'.format(best_key, best_fit,
 693                     sanitise(scytale_decipher(message, best_key))[:50]))
 694     return best_key, best_fit
 695
 696 def column_transposition_break(message,
 697                   wordlist=keywords,
 698                   metric=norms.euclidean_distance,
 699                   target_counts=normalised_english_bigram_counts,
 700                   message_frequency_scaling=norms.normalise):
 701     """Breaks a column transposition cipher using a dictionary and
 702     n-gram frequency analysis
 703
 704     >>> column_transposition_break(column_transposition_encipher(sanitise( \
 705         "Turing's homosexuality resulted in a criminal prosecution in 1952, \
 706         when homosexual acts were still illegal in the United Kingdom. "), \
 707         'encipher'), \
 708         wordlist=['encipher', 'keyword', 'fourteen']) # doctest: +ELLIPSIS
 709     ('encipher', 0.898128626285...)
 710     >>> column_transposition_break(column_transposition_encipher(sanitise( \
 711         "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
 712         "when homosexual acts were still illegal in the United Kingdom."), \
 713         'encipher'), \
 714         wordlist=['encipher', 'keyword', 'fourteen'], \
 715         target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
 716     ('encipher', 1.1958792913127...)
 717     """
 718     best_keyword = ''
 719     best_fit = float("inf")
 720     ngram_length = len(next(iter(target_counts.keys())))
 721     for keyword in wordlist:
 722         if len(message) % len(deduplicate(keyword)) == 0:
 723             plaintext = column_transposition_decipher(message, keyword)
 724             counts = message_frequency_scaling(frequencies(
 725                          ngrams(sanitise(plaintext), ngram_length)))
 726             fit = metric(target_counts, counts)
 727             logger.debug('Column transposition break attempt using key {0} '
 728                          'gives fit of {1} and decrypt starting: {2}'.format(
 729                              keyword, fit,
 730                              sanitise(plaintext)[:50]))
 731             if fit < best_fit:
 732                 best_fit = fit
 733                 best_keyword = keyword
 734     logger.info('Column transposition break best fit with key {0} gives fit '
 735                 'of {1} and decrypt starting: {2}'.format(best_keyword,
 736                     best_fit, sanitise(
 737                         column_transposition_decipher(message,
 738                             best_keyword))[:50]))
 739     return best_keyword, best_fit
 740
 741
 742 def column_transposition_break_mp(message,
 743                      wordlist=keywords,
 744                      metric=norms.euclidean_distance,
 745                      target_counts=normalised_english_bigram_counts,
 746                      message_frequency_scaling=norms.normalise,
 747                      chunksize=500):
 748     """Breaks a column transposition cipher using a dictionary and
 749     n-gram frequency analysis
 750
 751     >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
 752         "Turing's homosexuality resulted in a criminal prosecution in 1952, \
 753         when homosexual acts were still illegal in the United Kingdom. "), \
 754         'encipher'), \
 755         wordlist=['encipher', 'keyword', 'fourteen']) # doctest: +ELLIPSIS
 756     ('encipher', 0.898128626285...)
 757     >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
 758         "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
 759         "when homosexual acts were still illegal in the United Kingdom."), \
 760         'encipher'), \
 761         wordlist=['encipher', 'keyword', 'fourteen'], \
 762         target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
 763     ('encipher', 1.1958792913127...)
 764     """
 765     ngram_length = len(next(iter(target_counts.keys())))
 766     with Pool() as pool:
 767         helper_args = [(message, word, metric, target_counts, ngram_length,
 768                         message_frequency_scaling)
 769                        for word in wordlist]
 770         # Gotcha: the helper function here needs to be defined at the top level
 771         #   (limitation of Pool.starmap)
 772         breaks = pool.starmap(column_transposition_break_worker, helper_args, chunksize)
 773         return min(breaks, key=lambda k: k[1])
 774
 775 def column_transposition_break_worker(message, keyword, metric, target_counts,
 776                       ngram_length, message_frequency_scaling):
 777     plaintext = column_transposition_decipher(message, keyword)
 778     counts = message_frequency_scaling(frequencies(
 779                          ngrams(sanitise(plaintext), ngram_length)))
 780     fit = metric(target_counts, counts)
 781     logger.debug('Column transposition break attempt using key {0} '
 782                          'gives fit of {1} and decrypt starting: {2}'.format(
 783                              keyword, fit,
 784                              sanitise(plaintext)[:50]))
 785     return keyword, fit
 786
 787 def vigenere_keyword_break(message,
 788                   wordlist=keywords,
 789                   metric=norms.euclidean_distance,
 790                   target_counts=normalised_english_counts,
 791                   message_frequency_scaling=norms.normalise):
 792     """Breaks a vigenere cipher using a dictionary and
 793     frequency analysis
 794
 795     >>> vigenere_keyword_break(keyword_encipher('this is a test message for the ' \
 796              'keyword decipherment', 'elephant', 1), \
 797              wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 798     ('elephant', 0.7166585201707...)
 799     """
 800     best_keyword = ''
 801     best_fit = float("inf")
 802     for keyword in wordlist:
 803         plaintext = vigenere_decipher(message, keyword)
 804         counts = message_frequency_scaling(letter_frequencies(plaintext))
 805         fit = metric(target_counts, counts)
 806         logger.debug('Vigenere break attempt using key {0} '
 807                          'gives fit of {1} and decrypt starting: {2}'.format(
 808                              keyword, fit,
 809                              sanitise(plaintext)[:50]))
 810         if fit < best_fit:
 811             best_fit = fit
 812             best_keyword = keyword
 813     logger.info('Vigenere break best fit with key {0} gives fit '
 814                 'of {1} and decrypt starting: {2}'.format(best_keyword,
 815                     best_fit, sanitise(
 816                         vigenere_decipher(message, best_keyword))[:50]))
 817     return best_keyword, best_fit
 818
 819 def vigenere_keyword_break_mp(message,
 820                      wordlist=keywords,
 821                      metric=norms.euclidean_distance,
 822                      target_counts=normalised_english_counts,
 823                      message_frequency_scaling=norms.normalise,
 824                      chunksize=500):
 825     """Breaks a vigenere cipher using a dictionary and
 826     frequency analysis
 827
 828     >>> vigenere_keyword_break_mp(keyword_encipher('this is a test message for the ' \
 829              'keyword decipherment', 'elephant', 1), \
 830              wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 831     ('elephant', 0.7166585201707...)
 832     """
 833     with Pool() as pool:
 834         helper_args = [(message, word, metric, target_counts,
 835                         message_frequency_scaling)
 836                        for word in wordlist]
 837         # Gotcha: the helper function here needs to be defined at the top level
 838         #   (limitation of Pool.starmap)
 839         breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize)
 840         return min(breaks, key=lambda k: k[1])
 841
 842 def vigenere_keyword_break_worker(message, keyword, metric, target_counts,
 843                       message_frequency_scaling):
 844     plaintext = vigenere_decipher(message, keyword)
 845     counts = message_frequency_scaling(letter_frequencies(plaintext))
 846     fit = metric(target_counts, counts)
 847     logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
 848                  '{1} and decrypt starting: {2}'.format(keyword,
 849                      fit, sanitise(plaintext)[:50]))
 850     return keyword, fit
 851
 852
 853
 854 if __name__ == "__main__":
 855     import doctest
 856     doctest.testmod()