Split breaking routines out into a separate file. Column transposition now uses as...

author Neil Smith <neil.github@njae.me.uk>

Fri, 22 Nov 2013 10:32:20 +0000 (10:32 +0000)

committer Neil Smith <neil.github@njae.me.uk>

Fri, 22 Nov 2013 10:32:20 +0000 (10:32 +0000)
author Neil Smith <neil.github@njae.me.uk>
Fri, 22 Nov 2013 10:32:20 +0000 (10:32 +0000)
committer Neil Smith <neil.github@njae.me.uk>
Fri, 22 Nov 2013 10:32:20 +0000 (10:32 +0000)
diff --git a/2013/solutions.txt b/2013/solutions.txt

index 0ad88be6ff727d44e10b090606b34b9ec1f9acda..b54c58f0f9bbb8bcdb1f0ee8d438f6def2d219bb 100644 (file)
--- a/2013/solutions.txt
+++ b/2013/solutions.txt
@@ -9,4 +9,5 @@
  4a: keyword_decipher(c4a, 'montal', 2)
  4b: keyword_decipher(c4b, 'salvation', 2)
  5a: keyword_decipher(c5a, 'alfredo', 2)
  4a: keyword_decipher(c4a, 'montal', 2)
  4b: keyword_decipher(c4b, 'salvation', 2)
  5a: keyword_decipher(c5a, 'alfredo', 2)
+5b: vigenere_decipher(c5bs, 'florence')
  
  
diff --git a/break.py b/break.py

new file mode 100644 (file)

index 0000000..08b7301
--- /dev/null
+++ b/break.py
@@ -0,0 +1,428 @@
+import string
+import collections
+import norms
+import logging
+from itertools import zip_longest, cycle
+from segment import segment
+from multiprocessing import Pool
+
+from cipher import *
+
+# To time a run:
+#
+# import timeit
+# c5a = open('2012/5a.ciphertext', 'r').read()
+# timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
+# timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
+
+
+english_counts = collections.defaultdict(int)
+with open('count_1l.txt', 'r') as f:
+    for line in f:
+        (letter, count) = line.split("\t")
+        english_counts[letter] = int(count)
+normalised_english_counts = norms.normalise(english_counts)
+
+english_bigram_counts = collections.defaultdict(int)
+with open('count_2l.txt', 'r') as f:
+    for line in f:
+        (bigram, count) = line.split("\t")
+        english_bigram_counts[bigram] = int(count)
+normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
+
+english_trigram_counts = collections.defaultdict(int)
+with open('count_3l.txt', 'r') as f:
+    for line in f:
+        (trigram, count) = line.split("\t")
+        english_trigram_counts[trigram] = int(count)
+normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
+
+
+with open('words.txt', 'r') as f:
+    keywords = [line.rstrip() for line in f]
+
+transpositions = collections.defaultdict(list)
+for word in keywords:
+    transpositions[transpositions_of(word)] += [word]
+
+def frequencies(text):
+    """Count the number of occurrences of each character in text
+    
+    >>> sorted(frequencies('abcdefabc').items())
+    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
+    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
+         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
+    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), 
+     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), 
+     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), 
+     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
+    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
+         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
+    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), 
+     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), 
+     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), 
+     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), 
+     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
+    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
+         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
+    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), 
+     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), 
+     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), 
+     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
+    >>> frequencies('abcdefabcdef')['x']
+    0
+    """
+    #counts = collections.defaultdict(int)
+    #for c in text: 
+    #    counts[c] += 1
+    #return counts
+    return collections.Counter(c for c in text)
+letter_frequencies = frequencies
+
+
+
+def caesar_break(message, 
+                 metric=norms.euclidean_distance, 
+                 target_counts=normalised_english_counts, 
+                 message_frequency_scaling=norms.normalise):
+    """Breaks a Caesar cipher using frequency analysis
+    
+    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
+          'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
+    (4, 0.31863952890183...)
+    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
+          'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
+    (19, 0.42152901235832...)
+    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
+          'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
+    (13, 0.316029208075451...)
+    """
+    sanitised_message = sanitise(message)
+    best_shift = 0
+    best_fit = float("inf")
+    for shift in range(26):
+        plaintext = caesar_decipher(sanitised_message, shift)
+        counts = message_frequency_scaling(letter_frequencies(plaintext))
+        fit = metric(target_counts, counts)
+        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
+                      'and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
+        if fit < best_fit:
+            best_fit = fit
+            best_shift = shift
+    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
+                'decrypt starting: {2}'.format(best_shift, best_fit, 
+                    caesar_decipher(sanitised_message, best_shift)[:50]))
+    return best_shift, best_fit
+
+def affine_break(message, 
+                 metric=norms.euclidean_distance, 
+                 target_counts=normalised_english_counts, 
+                 message_frequency_scaling=norms.normalise):
+    """Breaks an affine cipher using frequency analysis
+    
+    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
+          'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
+          'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
+          'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
+          'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
+          'clm ckuxj.') # doctest: +ELLIPSIS
+    ((15, 22, True), 0.23570361818655...)
+    """
+    sanitised_message = sanitise(message)
+    best_multiplier = 0
+    best_adder = 0
+    best_one_based = True
+    best_fit = float("inf")
+    for one_based in [True, False]:
+        for multiplier in range(1, 26, 2):
+            for adder in range(26):
+                plaintext = affine_decipher(sanitised_message, 
+                                            multiplier, adder, one_based)
+                counts = message_frequency_scaling(letter_frequencies(plaintext))
+                fit = metric(target_counts, counts)
+                logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
+                             'gives fit of {3} and decrypt starting: {4}'.
+                             format(multiplier, adder, one_based, fit, 
+                                    plaintext[:50]))
+                if fit < best_fit:
+                    best_fit = fit
+                    best_multiplier = multiplier
+                    best_adder = adder
+                    best_one_based = one_based
+    logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
+                'and decrypt starting: {4}'.format(
+                    best_multiplier, best_adder, best_one_based, best_fit, 
+                    affine_decipher(sanitised_message, best_multiplier, 
+                        best_adder, best_one_based)[:50]))
+    return (best_multiplier, best_adder, best_one_based), best_fit
+
+def keyword_break(message, 
+                  wordlist=keywords, 
+                  metric=norms.euclidean_distance, 
+                  target_counts=normalised_english_counts, 
+                  message_frequency_scaling=norms.normalise):
+    """Breaks a keyword substitution cipher using a dictionary and 
+    frequency analysis
+
+    >>> keyword_break(keyword_encipher('this is a test message for the ' \
+          'keyword decipherment', 'elephant', 1), \
+          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
+    (('elephant', 1), 0.41643991598441...)
+    """
+    best_keyword = ''
+    best_wrap_alphabet = True
+    best_fit = float("inf")
+    for wrap_alphabet in range(3):
+        for keyword in wordlist:
+            plaintext = keyword_decipher(message, keyword, wrap_alphabet)
+            counts = message_frequency_scaling(letter_frequencies(plaintext))
+            fit = metric(target_counts, counts)
+            logger.debug('Keyword break attempt using key {0} (wrap={1}) '
+                         'gives fit of {2} and decrypt starting: {3}'.format(
+                             keyword, wrap_alphabet, fit, 
+                             sanitise(plaintext)[:50]))
+            if fit < best_fit:
+                best_fit = fit
+                best_keyword = keyword
+                best_wrap_alphabet = wrap_alphabet
+    logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
+                '{2} and decrypt starting: {3}'.format(best_keyword, 
+                    best_wrap_alphabet, best_fit, sanitise(
+                        keyword_decipher(message, best_keyword, 
+                                         best_wrap_alphabet))[:50]))
+    return (best_keyword, best_wrap_alphabet), best_fit
+
+def keyword_break_mp(message, 
+                     wordlist=keywords, 
+                     metric=norms.euclidean_distance, 
+                     target_counts=normalised_english_counts, 
+                     message_frequency_scaling=norms.normalise, 
+                     chunksize=500):
+    """Breaks a keyword substitution cipher using a dictionary and 
+    frequency analysis
+
+    >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
+          'keyword decipherment', 'elephant', 1), \
+          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
+    (('elephant', 1), 0.41643991598441...)
+    """
+    with Pool() as pool:
+        helper_args = [(message, word, wrap, metric, target_counts, 
+                        message_frequency_scaling) 
+                       for word in wordlist for wrap in range(3)]
+        # Gotcha: the helper function here needs to be defined at the top level 
+        #   (limitation of Pool.starmap)
+        breaks = pool.starmap(keyword_break_worker, helper_args, chunksize) 
+        return min(breaks, key=lambda k: k[1])
+
+def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts, 
+                      message_frequency_scaling):
+    plaintext = keyword_decipher(message, keyword, wrap_alphabet)
+    counts = message_frequency_scaling(letter_frequencies(plaintext))
+    fit = metric(target_counts, counts)
+    logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
+                 '{2} and decrypt starting: {3}'.format(keyword, 
+                     wrap_alphabet, fit, sanitise(plaintext)[:50]))
+    return (keyword, wrap_alphabet), fit
+
+def scytale_break(message, 
+                  metric=norms.euclidean_distance, 
+                  target_counts=normalised_english_bigram_counts, 
+                  message_frequency_scaling=norms.normalise):
+    """Breaks a Scytale cipher
+    
+    >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
+           'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
+           'aek') # doctest: +ELLIPSIS
+    (6, 0.83453041115025...)
+    """
+    best_key = 0
+    best_fit = float("inf")
+    ngram_length = len(next(iter(target_counts.keys())))
+    for key in range(1, 20):
+        if len(message) % key == 0:
+            plaintext = scytale_decipher(message, key)
+            counts = message_frequency_scaling(frequencies(
+                         ngrams(sanitise(plaintext), ngram_length)))
+            fit = metric(target_counts, counts)
+            logger.debug('Scytale break attempt using key {0} gives fit of '
+                         '{1} and decrypt starting: {2}'.format(key, 
+                             fit, sanitise(plaintext)[:50]))
+            if fit < best_fit:
+                best_fit = fit
+                best_key = key
+    logger.info('Scytale break best fit with key {0} gives fit of {1} and '
+                'decrypt starting: {2}'.format(best_key, best_fit, 
+                    sanitise(scytale_decipher(message, best_key))[:50]))
+    return best_key, best_fit
+
+def column_transposition_break(message, 
+                  translist=transpositions, 
+                  metric=norms.euclidean_distance, 
+                  target_counts=normalised_english_bigram_counts, 
+                  message_frequency_scaling=norms.normalise):
+    """Breaks a column transposition cipher using a dictionary and 
+    n-gram frequency analysis
+
+    >>> column_transposition_break(column_transposition_encipher(sanitise( \
+        "Turing's homosexuality resulted in a criminal prosecution in 1952, \
+        when homosexual acts were still illegal in the United Kingdom. "), \
+        'encipher'), \
+        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
+                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
+                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
+    ((2, 0, 5, 3, 1, 4, 6), 0.898128626285...)
+    >>> column_transposition_break(column_transposition_encipher(sanitise( \
+        "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
+        "when homosexual acts were still illegal in the United Kingdom."), \
+        'encipher'), \
+        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
+                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
+                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
+        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
+    ((2, 0, 5, 3, 1, 4, 6), 1.1958792913127...)
+    """
+    best_transposition = ''
+    best_fit = float("inf")
+    ngram_length = len(next(iter(target_counts.keys())))
+    for transposition in translist.keys():
+        if len(message) % len(transposition) == 0:
+            plaintext = column_transposition_decipher(message, transposition)
+            counts = message_frequency_scaling(frequencies(
+                         ngrams(sanitise(plaintext), ngram_length)))
+            fit = metric(target_counts, counts)
+            logger.debug('Column transposition break attempt using key {0} '
+                         'gives fit of {1} and decrypt starting: {2}'.format(
+                             translist[transposition][0], fit, 
+                             sanitise(plaintext)[:50]))
+            if fit < best_fit:
+                best_fit = fit
+                best_transposition = transposition
+    logger.info('Column transposition break best fit with key {0} gives fit '
+                'of {1} and decrypt starting: {2}'.format(
+                    translist[best_transposition][0], 
+                    best_fit, sanitise(
+                        column_transposition_decipher(message, 
+                            best_transposition))[:50]))
+    return best_transposition, best_fit
+
+
+def column_transposition_break_mp(message, 
+                     translist=transpositions, 
+                     metric=norms.euclidean_distance, 
+                     target_counts=normalised_english_bigram_counts, 
+                     message_frequency_scaling=norms.normalise, 
+                     chunksize=500):
+    """Breaks a column transposition cipher using a dictionary and 
+    n-gram frequency analysis
+
+    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
+        "Turing's homosexuality resulted in a criminal prosecution in 1952, \
+        when homosexual acts were still illegal in the United Kingdom. "), \
+        'encipher'), \
+        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
+                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
+                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}) # doctest: +ELLIPSIS
+    ((2, 0, 5, 3, 1, 4, 6), 0.898128626285...)
+    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
+        "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
+        "when homosexual acts were still illegal in the United Kingdom."), \
+        'encipher'), \
+        translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
+                   (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
+                   (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
+        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
+    ((2, 0, 5, 3, 1, 4, 6), 1.1958792913127...)
+    """
+    ngram_length = len(next(iter(target_counts.keys())))
+    with Pool() as pool:
+        helper_args = [(message, trans, metric, target_counts, ngram_length,
+                        message_frequency_scaling) 
+                       for trans in translist.keys()]
+        # Gotcha: the helper function here needs to be defined at the top level 
+        #   (limitation of Pool.starmap)
+        breaks = pool.starmap(column_transposition_break_worker, helper_args, chunksize) 
+        return min(breaks, key=lambda k: k[1])
+
+def column_transposition_break_worker(message, transposition, metric, target_counts, 
+                      ngram_length, message_frequency_scaling):
+    plaintext = column_transposition_decipher(message, transposition)
+    counts = message_frequency_scaling(frequencies(
+                         ngrams(sanitise(plaintext), ngram_length)))
+    fit = metric(target_counts, counts)
+    logger.debug('Column transposition break attempt using key {0} '
+                         'gives fit of {1} and decrypt starting: {2}'.format(
+                             transposition, fit, 
+                             sanitise(plaintext)[:50]))
+    return transposition, fit
+
+def vigenere_keyword_break(message, 
+                  wordlist=keywords, 
+                  metric=norms.euclidean_distance, 
+                  target_counts=normalised_english_counts, 
+                  message_frequency_scaling=norms.normalise):
+    """Breaks a vigenere cipher using a dictionary and 
+    frequency analysis
+    
+    >>> vigenere_keyword_break(keyword_encipher('this is a test message for the ' \
+             'keyword decipherment', 'elephant', 1), \
+             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
+    ('elephant', 0.7166585201707...)
+    """
+    best_keyword = ''
+    best_fit = float("inf")
+    for keyword in wordlist:
+        plaintext = vigenere_decipher(message, keyword)
+        counts = message_frequency_scaling(letter_frequencies(plaintext))
+        fit = metric(target_counts, counts)
+        logger.debug('Vigenere break attempt using key {0} '
+                         'gives fit of {1} and decrypt starting: {2}'.format(
+                             keyword, fit, 
+                             sanitise(plaintext)[:50]))
+        if fit < best_fit:
+            best_fit = fit
+            best_keyword = keyword
+    logger.info('Vigenere break best fit with key {0} gives fit '
+                'of {1} and decrypt starting: {2}'.format(best_keyword, 
+                    best_fit, sanitise(
+                        vigenere_decipher(message, best_keyword))[:50]))
+    return best_keyword, best_fit
+
+def vigenere_keyword_break_mp(message, 
+                     wordlist=keywords, 
+                     metric=norms.euclidean_distance, 
+                     target_counts=normalised_english_counts, 
+                     message_frequency_scaling=norms.normalise, 
+                     chunksize=500):
+    """Breaks a vigenere cipher using a dictionary and 
+    frequency analysis
+
+    >>> vigenere_keyword_break_mp(keyword_encipher('this is a test message for the ' \
+             'keyword decipherment', 'elephant', 1), \
+             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
+    ('elephant', 0.7166585201707...)
+    """
+    with Pool() as pool:
+        helper_args = [(message, word, metric, target_counts, 
+                        message_frequency_scaling) 
+                       for word in wordlist]
+        # Gotcha: the helper function here needs to be defined at the top level 
+        #   (limitation of Pool.starmap)
+        breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize) 
+        return min(breaks, key=lambda k: k[1])
+
+def vigenere_keyword_break_worker(message, keyword, metric, target_counts, 
+                      message_frequency_scaling):
+    plaintext = vigenere_decipher(message, keyword)
+    counts = message_frequency_scaling(letter_frequencies(plaintext))
+    fit = metric(target_counts, counts)
+    logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
+                 '{1} and decrypt starting: {2}'.format(keyword, 
+                     fit, sanitise(plaintext)[:50]))
+    return keyword, fit
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
+
diff --git a/cipher.py b/cipher.py

index 84a4915e0f61952215671a42fe1aee81268ca259..6f72c26a48d35dc3689f33731e820dcc66f41da6 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -1,18 +1,12 @@
  import string
  import collections
  import string
  import collections
-import norms
+# import norms
  import logging
  import logging
-import math
+# import math
  from itertools import zip_longest, repeat, cycle
  from itertools import zip_longest, repeat, cycle
-from segment import segment
-from multiprocessing import Pool
+# from segment import segment
+# from multiprocessing import Pool
  
  
-# To time a run:
-#
-# import timeit
-# c5a = open('2012/5a.ciphertext', 'r').read()
-# timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
-# timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1
  
  logger = logging.getLogger(__name__)
  logger.addHandler(logging.FileHandler('cipher.log'))
  
  logger = logging.getLogger(__name__)
  logger.addHandler(logging.FileHandler('cipher.log'))
@@ -20,30 +14,6 @@ logger.setLevel(logging.WARNING)
  #logger.setLevel(logging.INFO)
  #logger.setLevel(logging.DEBUG)
  
  #logger.setLevel(logging.INFO)
  #logger.setLevel(logging.DEBUG)
  
-english_counts = collections.defaultdict(int)
-with open('count_1l.txt', 'r') as f:
-    for line in f:
-        (letter, count) = line.split("\t")
-        english_counts[letter] = int(count)
-normalised_english_counts = norms.normalise(english_counts)
-
-english_bigram_counts = collections.defaultdict(int)
-with open('count_2l.txt', 'r') as f:
-    for line in f:
-        (bigram, count) = line.split("\t")
-        english_bigram_counts[bigram] = int(count)
-normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
-
-english_trigram_counts = collections.defaultdict(int)
-with open('count_3l.txt', 'r') as f:
-    for line in f:
-        (trigram, count) = line.split("\t")
-        english_trigram_counts[trigram] = int(count)
-normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
-
-
-with open('words.txt', 'r') as f:
-    keywords = [line.rstrip() for line in f]
  
  modular_division_table = [[0]*26 for x in range(26)]
  for a in range(26):
  
  modular_division_table = [[0]*26 for x in range(26)]
  for a in range(26):
@@ -117,11 +87,11 @@ def combine_every_nth(split_text):
  def transpose(items, transposition):
      """Moves items around according to the given transposition
      
  def transpose(items, transposition):
      """Moves items around according to the given transposition
      
-    >>> transpose(['a', 'b', 'c', 'd'], [0,1,2,3])
+    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
      ['a', 'b', 'c', 'd']
      ['a', 'b', 'c', 'd']
-    >>> transpose(['a', 'b', 'c', 'd'], [3,1,2,0])
+    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
      ['d', 'b', 'c', 'a']
      ['d', 'b', 'c', 'a']
-    >>> transpose([10,11,12,13,14,15], [3,2,4,1,5,0])  
+    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
      [13, 12, 14, 11, 15, 10]
      """
      transposed = list(repeat('', len(transposition)))
      [13, 12, 14, 11, 15, 10]
      """
      transposed = list(repeat('', len(transposition)))
@@ -145,39 +115,6 @@ def untranspose(items, transposition):
      return transposed
  
  
      return transposed
  
  
-def frequencies(text):
-    """Count the number of occurrences of each character in text
-    
-    >>> sorted(frequencies('abcdefabc').items())
-    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
-    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
-         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
-    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), 
-     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), 
-     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), 
-     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
-         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
-    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), 
-     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), 
-     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), 
-     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), 
-     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
-         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
-    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), 
-     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), 
-     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), 
-     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> frequencies('abcdefabcdef')['x']
-    0
-    """
-    #counts = collections.defaultdict(int)
-    #for c in text: 
-    #    counts[c] += 1
-    #return counts
-    return collections.Counter(c for c in text)
-letter_frequencies = frequencies
  
  def deduplicate(text):
      return list(collections.OrderedDict.fromkeys(text))
  
  def deduplicate(text):
      return list(collections.OrderedDict.fromkeys(text))
@@ -445,12 +382,21 @@ def transpositions_of(keyword):
      second column (1) moves to third, the third column (2) moves to second, 
      and so on.
  
      second column (1) moves to third, the third column (2) moves to second, 
      and so on.
  
+    If passed a tuple, assume it's already a transposition and just return it.
+
      >>> transpositions_of('clever')
      >>> transpositions_of('clever')
-    [0, 2, 1, 4, 3]
-    """
-    key = deduplicate(keyword)
-    transpositions = [key.index(l) for l in sorted(key)]
-    return transpositions
+    (0, 2, 1, 4, 3)
+    >>> transpositions_of('fred')
+    (3, 2, 0, 1)
+    >>> transpositions_of((3, 2, 0, 1))
+    (3, 2, 0, 1)
+    """
+    if isinstance(keyword, tuple):
+        return keyword
+    else:
+        key = deduplicate(keyword)
+        transpositions = tuple(key.index(l) for l in sorted(key))
+        return transpositions
  
  def column_transposition_encipher(message, keyword, fillvalue=' '):
      """Enciphers using the column transposition cipher.
  
  def column_transposition_encipher(message, keyword, fillvalue=' '):
      """Enciphers using the column transposition cipher.
@@ -518,339 +464,6 @@ def vigenere_decipher(message, keyword):
  
  
  
  
  
  
-def caesar_break(message, 
-                 metric=norms.euclidean_distance, 
-                 target_counts=normalised_english_counts, 
-                 message_frequency_scaling=norms.normalise):
-    """Breaks a Caesar cipher using frequency analysis
-    
-    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
-          'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
-    (4, 0.31863952890183...)
-    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
-          'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
-    (19, 0.42152901235832...)
-    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
-          'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
-    (13, 0.316029208075451...)
-    """
-    sanitised_message = sanitise(message)
-    best_shift = 0
-    best_fit = float("inf")
-    for shift in range(26):
-        plaintext = caesar_decipher(sanitised_message, shift)
-        counts = message_frequency_scaling(letter_frequencies(plaintext))
-        fit = metric(target_counts, counts)
-        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
-                      'and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
-        if fit < best_fit:
-            best_fit = fit
-            best_shift = shift
-    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
-                'decrypt starting: {2}'.format(best_shift, best_fit, 
-                    caesar_decipher(sanitised_message, best_shift)[:50]))
-    return best_shift, best_fit
-
-def affine_break(message, 
-                 metric=norms.euclidean_distance, 
-                 target_counts=normalised_english_counts, 
-                 message_frequency_scaling=norms.normalise):
-    """Breaks an affine cipher using frequency analysis
-    
-    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \
-          'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \
-          'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \
-          'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
-          'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \
-          'clm ckuxj.') # doctest: +ELLIPSIS
-    ((15, 22, True), 0.23570361818655...)
-    """
-    sanitised_message = sanitise(message)
-    best_multiplier = 0
-    best_adder = 0
-    best_one_based = True
-    best_fit = float("inf")
-    for one_based in [True, False]:
-        for multiplier in range(1, 26, 2):
-            for adder in range(26):
-                plaintext = affine_decipher(sanitised_message, 
-                                            multiplier, adder, one_based)
-                counts = message_frequency_scaling(letter_frequencies(plaintext))
-                fit = metric(target_counts, counts)
-                logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
-                             'gives fit of {3} and decrypt starting: {4}'.
-                             format(multiplier, adder, one_based, fit, 
-                                    plaintext[:50]))
-                if fit < best_fit:
-                    best_fit = fit
-                    best_multiplier = multiplier
-                    best_adder = adder
-                    best_one_based = one_based
-    logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} '
-                'and decrypt starting: {4}'.format(
-                    best_multiplier, best_adder, best_one_based, best_fit, 
-                    affine_decipher(sanitised_message, best_multiplier, 
-                        best_adder, best_one_based)[:50]))
-    return (best_multiplier, best_adder, best_one_based), best_fit
-
-def keyword_break(message, 
-                  wordlist=keywords, 
-                  metric=norms.euclidean_distance, 
-                  target_counts=normalised_english_counts, 
-                  message_frequency_scaling=norms.normalise):
-    """Breaks a keyword substitution cipher using a dictionary and 
-    frequency analysis
-
-    >>> keyword_break(keyword_encipher('this is a test message for the ' \
-          'keyword decipherment', 'elephant', 1), \
-          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), 0.41643991598441...)
-    """
-    best_keyword = ''
-    best_wrap_alphabet = True
-    best_fit = float("inf")
-    for wrap_alphabet in range(3):
-        for keyword in wordlist:
-            plaintext = keyword_decipher(message, keyword, wrap_alphabet)
-            counts = message_frequency_scaling(letter_frequencies(plaintext))
-            fit = metric(target_counts, counts)
-            logger.debug('Keyword break attempt using key {0} (wrap={1}) '
-                         'gives fit of {2} and decrypt starting: {3}'.format(
-                             keyword, wrap_alphabet, fit, 
-                             sanitise(plaintext)[:50]))
-            if fit < best_fit:
-                best_fit = fit
-                best_keyword = keyword
-                best_wrap_alphabet = wrap_alphabet
-    logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of '
-                '{2} and decrypt starting: {3}'.format(best_keyword, 
-                    best_wrap_alphabet, best_fit, sanitise(
-                        keyword_decipher(message, best_keyword, 
-                                         best_wrap_alphabet))[:50]))
-    return (best_keyword, best_wrap_alphabet), best_fit
-
-def keyword_break_mp(message, 
-                     wordlist=keywords, 
-                     metric=norms.euclidean_distance, 
-                     target_counts=normalised_english_counts, 
-                     message_frequency_scaling=norms.normalise, 
-                     chunksize=500):
-    """Breaks a keyword substitution cipher using a dictionary and 
-    frequency analysis
-
-    >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
-          'keyword decipherment', 'elephant', 1), \
-          wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), 0.41643991598441...)
-    """
-    with Pool() as pool:
-        helper_args = [(message, word, wrap, metric, target_counts, 
-                        message_frequency_scaling) 
-                       for word in wordlist for wrap in range(3)]
-        # Gotcha: the helper function here needs to be defined at the top level 
-        #   (limitation of Pool.starmap)
-        breaks = pool.starmap(keyword_break_worker, helper_args, chunksize) 
-        return min(breaks, key=lambda k: k[1])
-
-def keyword_break_worker(message, keyword, wrap_alphabet, metric, target_counts, 
-                      message_frequency_scaling):
-    plaintext = keyword_decipher(message, keyword, wrap_alphabet)
-    counts = message_frequency_scaling(letter_frequencies(plaintext))
-    fit = metric(target_counts, counts)
-    logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of '
-                 '{2} and decrypt starting: {3}'.format(keyword, 
-                     wrap_alphabet, fit, sanitise(plaintext)[:50]))
-    return (keyword, wrap_alphabet), fit
-
-def scytale_break(message, 
-                  metric=norms.euclidean_distance, 
-                  target_counts=normalised_english_bigram_counts, 
-                  message_frequency_scaling=norms.normalise):
-    """Breaks a Scytale cipher
-    
-    >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \
-           'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \
-           'aek') # doctest: +ELLIPSIS
-    (6, 0.83453041115025...)
-    """
-    best_key = 0
-    best_fit = float("inf")
-    ngram_length = len(next(iter(target_counts.keys())))
-    for key in range(1, 20):
-        if len(message) % key == 0:
-            plaintext = scytale_decipher(message, key)
-            counts = message_frequency_scaling(frequencies(
-                         ngrams(sanitise(plaintext), ngram_length)))
-            fit = metric(target_counts, counts)
-            logger.debug('Scytale break attempt using key {0} gives fit of '
-                         '{1} and decrypt starting: {2}'.format(key, 
-                             fit, sanitise(plaintext)[:50]))
-            if fit < best_fit:
-                best_fit = fit
-                best_key = key
-    logger.info('Scytale break best fit with key {0} gives fit of {1} and '
-                'decrypt starting: {2}'.format(best_key, best_fit, 
-                    sanitise(scytale_decipher(message, best_key))[:50]))
-    return best_key, best_fit
-
-def column_transposition_break(message, 
-                  wordlist=keywords, 
-                  metric=norms.euclidean_distance, 
-                  target_counts=normalised_english_bigram_counts, 
-                  message_frequency_scaling=norms.normalise):
-    """Breaks a column transposition cipher using a dictionary and 
-    n-gram frequency analysis
-
-    >>> column_transposition_break(column_transposition_encipher(sanitise( \
-        "Turing's homosexuality resulted in a criminal prosecution in 1952, \
-        when homosexual acts were still illegal in the United Kingdom. "), \
-        'encipher'), \
-        wordlist=['encipher', 'keyword', 'fourteen']) # doctest: +ELLIPSIS
-    ('encipher', 0.898128626285...)
-    >>> column_transposition_break(column_transposition_encipher(sanitise( \
-        "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
-        "when homosexual acts were still illegal in the United Kingdom."), \
-        'encipher'), \
-        wordlist=['encipher', 'keyword', 'fourteen'], \
-        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
-    ('encipher', 1.1958792913127...)
-    """
-    best_keyword = ''
-    best_fit = float("inf")
-    ngram_length = len(next(iter(target_counts.keys())))
-    for keyword in wordlist:
-        if len(message) % len(deduplicate(keyword)) == 0:
-            plaintext = column_transposition_decipher(message, keyword)
-            counts = message_frequency_scaling(frequencies(
-                         ngrams(sanitise(plaintext), ngram_length)))
-            fit = metric(target_counts, counts)
-            logger.debug('Column transposition break attempt using key {0} '
-                         'gives fit of {1} and decrypt starting: {2}'.format(
-                             keyword, fit, 
-                             sanitise(plaintext)[:50]))
-            if fit < best_fit:
-                best_fit = fit
-                best_keyword = keyword
-    logger.info('Column transposition break best fit with key {0} gives fit '
-                'of {1} and decrypt starting: {2}'.format(best_keyword, 
-                    best_fit, sanitise(
-                        column_transposition_decipher(message, 
-                            best_keyword))[:50]))
-    return best_keyword, best_fit
-
-
-def column_transposition_break_mp(message, 
-                     wordlist=keywords, 
-                     metric=norms.euclidean_distance, 
-                     target_counts=normalised_english_bigram_counts, 
-                     message_frequency_scaling=norms.normalise, 
-                     chunksize=500):
-    """Breaks a column transposition cipher using a dictionary and 
-    n-gram frequency analysis
-
-    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
-        "Turing's homosexuality resulted in a criminal prosecution in 1952, \
-        when homosexual acts were still illegal in the United Kingdom. "), \
-        'encipher'), \
-        wordlist=['encipher', 'keyword', 'fourteen']) # doctest: +ELLIPSIS
-    ('encipher', 0.898128626285...)
-    >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
-        "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
-        "when homosexual acts were still illegal in the United Kingdom."), \
-        'encipher'), \
-        wordlist=['encipher', 'keyword', 'fourteen'], \
-        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
-    ('encipher', 1.1958792913127...)
-    """
-    ngram_length = len(next(iter(target_counts.keys())))
-    with Pool() as pool:
-        helper_args = [(message, word, metric, target_counts, ngram_length,
-                        message_frequency_scaling) 
-                       for word in wordlist]
-        # Gotcha: the helper function here needs to be defined at the top level 
-        #   (limitation of Pool.starmap)
-        breaks = pool.starmap(column_transposition_break_worker, helper_args, chunksize) 
-        return min(breaks, key=lambda k: k[1])
-
-def column_transposition_break_worker(message, keyword, metric, target_counts, 
-                      ngram_length, message_frequency_scaling):
-    plaintext = column_transposition_decipher(message, keyword)
-    counts = message_frequency_scaling(frequencies(
-                         ngrams(sanitise(plaintext), ngram_length)))
-    fit = metric(target_counts, counts)
-    logger.debug('Column transposition break attempt using key {0} '
-                         'gives fit of {1} and decrypt starting: {2}'.format(
-                             keyword, fit, 
-                             sanitise(plaintext)[:50]))
-    return keyword, fit
-
-def vigenere_keyword_break(message, 
-                  wordlist=keywords, 
-                  metric=norms.euclidean_distance, 
-                  target_counts=normalised_english_counts, 
-                  message_frequency_scaling=norms.normalise):
-    """Breaks a vigenere cipher using a dictionary and 
-    frequency analysis
-    
-    >>> vigenere_keyword_break(keyword_encipher('this is a test message for the ' \
-             'keyword decipherment', 'elephant', 1), \
-             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('elephant', 0.7166585201707...)
-    """
-    best_keyword = ''
-    best_fit = float("inf")
-    for keyword in wordlist:
-        plaintext = vigenere_decipher(message, keyword)
-        counts = message_frequency_scaling(letter_frequencies(plaintext))
-        fit = metric(target_counts, counts)
-        logger.debug('Vigenere break attempt using key {0} '
-                         'gives fit of {1} and decrypt starting: {2}'.format(
-                             keyword, fit, 
-                             sanitise(plaintext)[:50]))
-        if fit < best_fit:
-            best_fit = fit
-            best_keyword = keyword
-    logger.info('Vigenere break best fit with key {0} gives fit '
-                'of {1} and decrypt starting: {2}'.format(best_keyword, 
-                    best_fit, sanitise(
-                        vigenere_decipher(message, best_keyword))[:50]))
-    return best_keyword, best_fit
-
-def vigenere_keyword_break_mp(message, 
-                     wordlist=keywords, 
-                     metric=norms.euclidean_distance, 
-                     target_counts=normalised_english_counts, 
-                     message_frequency_scaling=norms.normalise, 
-                     chunksize=500):
-    """Breaks a vigenere cipher using a dictionary and 
-    frequency analysis
-
-    >>> vigenere_keyword_break_mp(keyword_encipher('this is a test message for the ' \
-             'keyword decipherment', 'elephant', 1), \
-             wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('elephant', 0.7166585201707...)
-    """
-    with Pool() as pool:
-        helper_args = [(message, word, metric, target_counts, 
-                        message_frequency_scaling) 
-                       for word in wordlist]
-        # Gotcha: the helper function here needs to be defined at the top level 
-        #   (limitation of Pool.starmap)
-        breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize) 
-        return min(breaks, key=lambda k: k[1])
-
-def vigenere_keyword_break_worker(message, keyword, metric, target_counts, 
-                      message_frequency_scaling):
-    plaintext = vigenere_decipher(message, keyword)
-    counts = message_frequency_scaling(letter_frequencies(plaintext))
-    fit = metric(target_counts, counts)
-    logger.debug('Vigenere keyword break attempt using key {0} gives fit of '
-                 '{1} and decrypt starting: {2}'.format(keyword, 
-                     fit, sanitise(plaintext)[:50]))
-    return keyword, fit
-
-
-
  if __name__ == "__main__":
      import doctest
      doctest.testmod()
  if __name__ == "__main__":
      import doctest
      doctest.testmod()
author	Neil Smith <neil.github@njae.me.uk>
	Fri, 22 Nov 2013 10:32:20 +0000 (10:32 +0000)
committer	Neil Smith <neil.github@njae.me.uk>
	Fri, 22 Nov 2013 10:32:20 +0000 (10:32 +0000)
2013/solutions.txt		patch \| blob \| history
break.py	[new file with mode: 0644]	patch \| blob
cipher.py		patch \| blob \| history