Test for column transposition break now done

[cipher-tools.git] / cipher.py
diff --git a/cipher.py b/cipher.py

index bc173a75200e5c1a997ec7d792f782633e154ff2..db2caffe96c55a6d075709b5a98949e0d8456e4c 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -3,7 +3,7 @@ import collections
  import norms
  import logging
  import math
-from itertools import zip_longest
+from itertools import zip_longest, repeat
  from segment import segment
  from multiprocessing import Pool
  
@@ -34,6 +34,14 @@ with open('count_2l.txt', 'r') as f:
          english_bigram_counts[bigram] = int(count)
  normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
  
+english_trigram_counts = collections.defaultdict(int)
+with open('count_3l.txt', 'r') as f:
+    for line in f:
+        (trigram, count) = line.split("\t")
+        english_trigram_counts[trigram] = int(count)
+normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
+
+
  with open('words.txt', 'r') as f:
      keywords = [line.rstrip() for line in f]
  
@@ -67,20 +75,22 @@ def ngrams(text, n):
      """
      return [text[i:i+n] for i in range(len(text)-n+1)]
  
-def every_nth(text, n):
+def every_nth(text, n, fillvalue=''):
      """Returns n strings, each of which consists of every nth character, 
      starting with the 0th, 1st, 2nd, ... (n-1)th character
      
      >>> every_nth(string.ascii_lowercase, 5)
      ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
-    >>> every_nth(string.ascii_lowercase, 1)                                                                                                              
+    >>> every_nth(string.ascii_lowercase, 1)
      ['abcdefghijklmnopqrstuvwxyz']
      >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
      ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
+    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
+    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
      """
      split_text = [text[i:i+n] for i in range(0, len(text), n)]
-    return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
+    return [''.join(l) for l in zip_longest(*split_text, fillvalue=fillvalue)]
  
  def combine_every_nth(split_text):
      """Reforms a text split into every_nth strings
@@ -95,6 +105,36 @@ def combine_every_nth(split_text):
      return ''.join([''.join(l) 
                      for l in zip_longest(*split_text, fillvalue='')])
  
+def transpose(items, transposition):
+    """Moves items around according to the given transposition
+    
+    >>> transpose(['a', 'b', 'c', 'd'], [0,1,2,3])
+    ['a', 'b', 'c', 'd']
+    >>> transpose(['a', 'b', 'c', 'd'], [3,1,2,0])
+    ['d', 'b', 'c', 'a']
+    >>> transpose([10,11,12,13,14,15], [3,2,4,1,5,0])  
+    [13, 12, 14, 11, 15, 10]
+    """
+    transposed = list(repeat('', len(transposition)))
+    for p, t in enumerate(transposition):
+       transposed[p] = items[t]
+    return transposed
+
+def untranspose(items, transposition):
+    """Undoes a transpose
+    
+    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
+    ['a', 'b', 'c', 'd']
+    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
+    ['a', 'b', 'c', 'd']
+    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
+    [10, 11, 12, 13, 14, 15]
+    """
+    transposed  = list(repeat('', len(transposition)))
+    for p, t in enumerate(transposition):
+       transposed[t] = items[p]
+    return transposed
+
  
  def frequencies(text):
      """Count the number of occurrences of each character in text
@@ -387,6 +427,65 @@ def scytale_decipher(message, rows):
      return ''.join([''.join(c) for c in zip_longest(*columns, fillvalue='')])
  
  
+def transpositions_of(keyword):
+    """Finds the transpositions given by a keyword. For instance, the keyword
+    'clever' rearranges to 'celrv', so the first column (0) stays first, the
+    second column (1) moves to third, the third column (2) moves to second, 
+    and so on.
+
+    >>> transpositions_of('clever')
+    [0, 2, 1, 4, 3]
+    """
+    key = deduplicate(keyword)
+    transpositions = [key.index(l) for l in sorted(key)]
+    return transpositions
+
+def column_transposition_encipher(message, keyword, fillvalue=' '):
+    """Enciphers using the column transposition cipher.
+    Message is padded to allow all rows to be the same length.
+
+    >>> column_transposition_encipher('hellothere', 'clever')
+    'hleolteher'
+    >>> column_transposition_encipher('hellothere', 'cleverly', fillvalue='!')
+    'hleolthre!e!'
+    """
+    return column_transposition_worker(message, keyword, encipher=True, 
+                                       fillvalue=fillvalue)
+
+def column_transposition_decipher(message, keyword, fillvalue=' '):
+    """Deciphers using the column transposition cipher.
+    Message is padded to allow all rows to be the same length.
+
+    >>> column_transposition_decipher('hleolteher', 'clever')
+    'hellothere'
+    >>> column_transposition_decipher('hleolthre!e!', 'cleverly', fillvalue='?')
+    'hellothere!!'
+    """
+    return column_transposition_worker(message, keyword, encipher=False, 
+                                       fillvalue=fillvalue)
+
+def column_transposition_worker(message, keyword, 
+                                encipher=True, fillvalue=' '):
+    """Does the actual work of the column transposition cipher.
+    Message is padded with fillvalue (a space by default) so all rows are the same length.
+
+    >>> column_transposition_worker('hellothere', 'clever')
+    'hleolteher'
+    >>> column_transposition_worker('hellothere', 'clever', encipher=True)
+    'hleolteher'
+    >>> column_transposition_worker('hleolteher', 'clever', encipher=False)
+    'hellothere'
+    """
+    transpositions = transpositions_of(keyword)
+    columns = every_nth(message, len(transpositions), fillvalue=fillvalue)
+    if encipher:
+        transposed_columns = transpose(columns, transpositions)
+    else:
+        transposed_columns = untranspose(columns, transpositions)
+    return combine_every_nth(transposed_columns)
+
+
+
  def caesar_break(message, 
                   metric=norms.euclidean_distance, 
                   target_counts=normalised_english_counts, 
@@ -561,6 +660,53 @@ def scytale_break(message,
                      sanitise(scytale_decipher(message, best_key))[:50]))
      return best_key, best_fit
  
+def column_transposition_break(message, 
+                  wordlist=keywords, 
+                  metric=norms.euclidean_distance, 
+                  #test_ngram_length=2,
+                  target_counts=normalised_english_bigram_counts, 
+                  message_frequency_scaling=norms.normalise):
+    """Breaks a column transposition cipher using a dictionary and 
+    n-gram frequency analysis
+
+    >>> column_transposition_break(column_transposition_encipher(sanitise( \
+        "Turing's homosexuality resulted in a criminal prosecution in 1952, \
+        when homosexual acts were still illegal in the United Kingdom. "), \
+        'encipher'), \
+        wordlist=['encipher', 'keyword', 'fourteen']) # doctest: +ELLIPSIS
+    ('encipher', 0.898128626285...)
+    >>> column_transposition_break(column_transposition_encipher(sanitise( \
+        "Turing's homosexuality resulted in a criminal prosecution in 1952, " \
+        "when homosexual acts were still illegal in the United Kingdom."), \
+        'encipher'), \
+        wordlist=['encipher', 'keyword', 'fourteen'], \
+        target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS
+    ('encipher', 1.1958792913127...)
+    """
+    best_keyword = ''
+    best_fit = float("inf")
+    ngram_length = len(next(iter(target_counts.keys())))
+    for keyword in wordlist:
+        if len(message) % len(deduplicate(keyword)) == 0:
+            plaintext = column_transposition_decipher(message, keyword)
+            counts = message_frequency_scaling(frequencies(
+                         ngrams(sanitise(plaintext), ngram_length)))
+            fit = metric(target_counts, counts)
+            logger.debug('Column transposition break attempt using key {0} '
+                         'gives fit of {1} and decrypt starting: {2}'.format(
+                             keyword, fit, 
+                             sanitise(plaintext)[:50]))
+            if fit < best_fit:
+                best_fit = fit
+                best_keyword = keyword
+    logger.info('Column transposition break best fit with key {0} gives fit '
+                'of {1} and decrypt starting: {2}'.format(best_keyword, 
+                    best_fit, sanitise(
+                        column_transposition_decipher(message, 
+                            best_keyword))[:50]))
+    return best_keyword, best_fit
+
+
  
  if __name__ == "__main__":
      import doctest