Done keyword ciphers and breaking

[cipher-tools.git] / cipher.py
diff --git a/cipher.py b/cipher.py

index 0fc9e8897f12b8920d07819f25c29b8e9efb59fb..5a8cfa22bd788981557c5fe733467e5ec2dc116e 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -1,5 +1,37 @@
  import string
  import collections
+import norms
+import logging
+
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.FileHandler('cipher.log'))
+logger.setLevel(logging.WARNING)
+#logger.setLevel(logging.INFO)
+
+english_counts = collections.defaultdict(int)
+with open('count_1l.txt', 'r') as f:
+    for line in f:
+        (letter, count) = line.split("\t")
+        english_counts[letter] = int(count)
+normalised_english_counts = norms.normalise(english_counts)        
+
+keywords = []
+with open('words.txt', 'r') as f:
+    keywords = [line.rstrip() for line in f]
+
+
+modular_division_table = [[0]*26 for x in range(26)]
+for a in range(26):
+    for b in range(26):
+        c = (a * b) % 26
+        modular_division_table[b][c] = a
+
+modular_division_table_one_based = [[0]*27 for x in range(27)]
+for a in range(27):
+    for b in range(27):
+        c = ((a * b)-1) % 26 + 1
+        modular_division_table_one_based[b][c] = a
+
  
  
  def sanitise(text):
@@ -13,6 +45,16 @@ def sanitise(text):
      sanitised = [c.lower() for c in text if c in string.ascii_letters]
      return ''.join(sanitised)
  
+def ngrams(text, n):
+    """Returns all n-grams of a text
+    
+    >>> ngrams(sanitise('the quick brown fox'), 2)
+    [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
+    >>> ngrams(sanitise('the quick brown fox'), 4)
+    [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
+    """
+    return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
+
  def letter_frequencies(text):
      """Count the number of occurrences of each character in text
      
@@ -30,106 +72,8 @@ def letter_frequencies(text):
          counts[c] += 1
      return counts
  
-
-def normalise_frequencies(frequencies):
-    """Scale a set of letter frequenies so they add to 1
-    
-    >>> sorted(normalise_frequencies(letter_frequencies('abcdefabc')).items())
-    [('a', 0.2222222222222222), ('b', 0.2222222222222222), ('c', 0.2222222222222222), ('d', 0.1111111111111111), ('e', 0.1111111111111111), ('f', 0.1111111111111111)]
-    >>> sorted(normalise_frequencies(letter_frequencies('the quick brown fox jumped over the lazy dog')).items())
-    [(' ', 0.18181818181818182), ('a', 0.022727272727272728), ('b', 0.022727272727272728), ('c', 0.022727272727272728), ('d', 0.045454545454545456), ('e', 0.09090909090909091), ('f', 0.022727272727272728), ('g', 0.022727272727272728), ('h', 0.045454545454545456), ('i', 0.022727272727272728), ('j', 0.022727272727272728), ('k', 0.022727272727272728), ('l', 0.022727272727272728), ('m', 0.022727272727272728), ('n', 0.022727272727272728), ('o', 0.09090909090909091), ('p', 0.022727272727272728), ('q', 0.022727272727272728), ('r', 0.045454545454545456), ('t', 0.045454545454545456), ('u', 0.045454545454545456), ('v', 0.022727272727272728), ('w', 0.022727272727272728), ('x', 0.022727272727272728), ('y', 0.022727272727272728), ('z', 0.022727272727272728)]
-    >>> sorted(normalise_frequencies(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
-    [(' ', 0.1568627450980392), ('!', 0.0196078431372549), ('(', 0.0196078431372549), (')', 0.0196078431372549), ('.', 0.058823529411764705), ('9', 0.0196078431372549), ('B', 0.0196078431372549), ('D', 0.0196078431372549), ('G', 0.0196078431372549), ('N', 0.0196078431372549), ('O', 0.0392156862745098), ('Q', 0.0196078431372549), ('R', 0.0196078431372549), ('T', 0.0196078431372549), ('W', 0.0196078431372549), ('a', 0.0196078431372549), ('c', 0.0196078431372549), ('d', 0.0196078431372549), ('e', 0.0784313725490196), ('f', 0.0196078431372549), ('h', 0.0392156862745098), ('i', 0.0196078431372549), ('j', 0.0196078431372549), ('k', 0.0196078431372549), ('l', 0.0196078431372549), ('m', 0.0196078431372549), ('o', 0.0392156862745098), ('p', 0.0196078431372549), ('r', 0.0196078431372549), ('t', 0.0196078431372549), ('u', 0.0392156862745098), ('v', 0.0196078431372549), ('x', 0.0196078431372549), ('y', 0.0196078431372549), ('z', 0.0196078431372549)]
-    >>> sorted(normalise_frequencies(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG'))).items())
-    [('a', 0.027777777777777776), ('b', 0.027777777777777776), ('c', 0.027777777777777776), ('d', 0.05555555555555555), ('e', 0.1111111111111111), ('f', 0.027777777777777776), ('g', 0.027777777777777776), ('h', 0.05555555555555555), ('i', 0.027777777777777776), ('j', 0.027777777777777776), ('k', 0.027777777777777776), ('l', 0.027777777777777776), ('m', 0.027777777777777776), ('n', 0.027777777777777776), ('o', 0.1111111111111111), ('p', 0.027777777777777776), ('q', 0.027777777777777776), ('r', 0.05555555555555555), ('t', 0.05555555555555555), ('u', 0.05555555555555555), ('v', 0.027777777777777776), ('w', 0.027777777777777776), ('x', 0.027777777777777776), ('y', 0.027777777777777776), ('z', 0.027777777777777776)]
-    """
-    total = sum(frequencies.values())
-    return dict((k, v / total) for (k, v) in frequencies.items())
-
-def l2_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-    
-    >>> l2_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l2_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l2_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.816496580927726
-    >>> l2_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.7071067811865476
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += (f1n[k] - f2n[k]) ** 2
-    return total ** 0.5
-euclidean_distance = l2_norm
-
-def l1_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> l1_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l1_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l1_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    1.3333333333333333
-    >>> l1_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    1.0
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += abs(f1n[k] - f2n[k])
-    return total
-
-def l3_norm(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> l3_norm({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l3_norm({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    0.0
-    >>> l3_norm({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.7181448966772946
-    >>> l3_norm({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.6299605249474366
-    """
-    f1n = normalise_frequencies(frequencies1)
-    f2n = normalise_frequencies(frequencies2)
-    total = 0
-    for k in f1n.keys():
-        total += abs(f1n[k] - f2n[k]) ** 3
-    return total ** (1/3)
-
-def cosine_distance(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
-    Assumes every key in frequencies1 is also in frequencies2
-
-    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
-    -2.220446049250313e-16
-    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
-    -2.220446049250313e-16
-    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
-    0.42264973081037416
-    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
-    0.29289321881345254
-    """
-    numerator = 0
-    length1 = 0
-    length2 = 0
-    for k in frequencies1.keys():
-        numerator += frequencies1[k] * frequencies2[k]
-        length1 += frequencies1[k]**2
-    for k in frequencies2.keys():
-        length2 += frequencies2[k]
-    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
-
+def deduplicate(text):
+    return list(collections.OrderedDict.fromkeys(text))
  
  
  
@@ -199,28 +143,151 @@ def caesar_decipher(message, shift):
      """
      return caesar_encipher(message, -shift)
  
-def caesar_break(message, metric=euclidean_distance):
+def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
+    """Encipher a letter, given a multiplier and adder
+    
+    >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
+    'HKNQTWZCFILORUXADGJMPSVYBE'
+    >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
+    'FILORUXADGJMPSVYBEHKNQTWZC'
+    """
+    if letter in string.ascii_letters:
+        if letter in string.ascii_uppercase:
+            alphabet_start = ord('A')
+        else:
+            alphabet_start = ord('a')
+        letter_number = ord(letter) - alphabet_start
+        if one_based: letter_number += 1
+        raw_cipher_number = (letter_number * multiplier + adder)
+        cipher_number = 0
+        if one_based: 
+            cipher_number = (raw_cipher_number - 1) % 26
+        else:
+            cipher_number = raw_cipher_number % 26        
+        return chr(cipher_number + alphabet_start)
+    else:
+        return letter
+
+def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
+    """Encipher a letter, given a multiplier and adder
+    
+    >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
+    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
+    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+    """
+    if letter in string.ascii_letters:
+        if letter in string.ascii_uppercase:
+            alphabet_start = ord('A')
+        else:
+            alphabet_start = ord('a')
+        cipher_number = ord(letter) - alphabet_start
+        if one_based: cipher_number += 1
+        plaintext_number = 0
+        if one_based:
+            plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
+        else:
+            #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
+            plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]            
+        return chr(plaintext_number + alphabet_start)
+    else:
+        return letter
+
+def affine_encipher(message, multiplier=1, adder=0, one_based=True):
+    """Encipher a message
+    
+    >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
+    'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
+    """
+    
+    enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
+    return ''.join(enciphered)
+
+def affine_decipher(message, multiplier=1, adder=0, one_based=True):
+    """Decipher a message
+    
+    >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
+    'hours passed during which jerico tried every trick he could think of'
+    """
+    enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
+    return ''.join(enciphered)
+
+
+def keyword_encipher(message, keyword):
+    cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
+    cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
+    return message.lower().translate(cipher_translation)
+
+def keyword_decipher(message, keyword):
+    cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
+    cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
+    return message.lower().translate(cipher_translation)
+
+
+def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
+    """Breaks a Caesar cipher using frequency analysis
+    
+    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm')
+    (4, 0.3186395289018361)
+    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert')
+    (19, 0.4215290123583277)
+    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur')
+    (13, 0.31602920807545154)
+    """
      sanitised_message = sanitise(message)
      best_shift = 0
      best_fit = float("inf")
-    for shift in range(1, 25):
+    for shift in range(26):
          plaintext = caesar_decipher(sanitised_message, shift)
-        frequencies = letter_frequencies(plaintext)
-        fit = metric(english_counts, frequencies)
+        frequencies = message_frequency_scaling(letter_frequencies(plaintext))
+        fit = metric(target_frequencies, frequencies)
+        logger.info('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
          if fit < best_fit:
              best_fit = fit
              best_shift = shift
+    logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
      return best_shift, best_fit
  
+def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
+    """Breaks an affine cipher using frequency analysis
+    
+    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.')
+    ((15, 22, True), 0.2357036181865554)
+    """
+    sanitised_message = sanitise(message)
+    best_multiplier = 0
+    best_adder = 0
+    best_one_based = True
+    best_fit = float("inf")
+    for one_based in [True, False]:
+        for multiplier in range(1, 26, 2):
+            for adder in range(26):
+                plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
+                frequencies = message_frequency_scaling(letter_frequencies(plaintext))
+                fit = metric(target_frequencies, frequencies)
+                logger.info('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
+                if fit < best_fit:
+                    best_fit = fit
+                    best_multiplier = multiplier
+                    best_adder = adder
+                    best_one_based = one_based
+    logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
+    return (best_multiplier, best_adder, best_one_based), best_fit
  
  
-
-
-english_counts = collections.defaultdict(int)
-with open('count_1l.txt', 'r') as f:
-    for line in f:
-        (letter, count) = line.split("\t")
-        english_counts[letter] = int(count)
+def keyword_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
+    best_keyword = ''
+    best_fit = float("inf")
+    for keyword in keywords:
+        plaintext = keyword_decipher(message, keyword)
+        frequencies = message_frequency_scaling(letter_frequencies(plaintext))
+        fit = metric(target_frequencies, frequencies)
+        logger.info('Keyword break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(keyword, fit, plaintext[:50]))
+        if fit < best_fit:
+            best_fit = fit
+            best_keyword = keyword
+    logger.info('Keyword break best fit with key {0} gives fit of {1} and decrypt starting: {2}'.format(best_keyword, best_fit, keyword_decipher(message, best_keyword)[:50]))
+    return best_keyword, best_fit
  
  
  if __name__ == "__main__":