Fixed numeric accuracy in doctests

[cipher-tools.git] / cipher.py
diff --git a/cipher.py b/cipher.py

index 3f9e29087408202e91b0294f231bea9edfc0054a..818ce51a8defde50a65e5eca491f6937eba6f545 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -6,7 +6,7 @@ import logging
  logger = logging.getLogger(__name__)
  logger.addHandler(logging.FileHandler('cipher.log'))
  logger.setLevel(logging.WARNING)
-logger.setLevel(logging.INFO)
+#logger.setLevel(logging.INFO)
  
  english_counts = collections.defaultdict(int)
  with open('count_1l.txt', 'r') as f:
@@ -15,6 +15,10 @@ with open('count_1l.txt', 'r') as f:
          english_counts[letter] = int(count)
  normalised_english_counts = norms.normalise(english_counts)        
  
+keywords = []
+with open('words.txt', 'r') as f:
+    keywords = [line.rstrip() for line in f]
+
  
  modular_division_table = [[0]*26 for x in range(26)]
  for a in range(26):
@@ -42,6 +46,13 @@ def sanitise(text):
      return ''.join(sanitised)
  
  def ngrams(text, n):
+    """Returns all n-grams of a text
+    
+    >>> ngrams(sanitise('the quick brown fox'), 2)
+    [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
+    >>> ngrams(sanitise('the quick brown fox'), 4)
+    [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
+    """
      return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
  
  def letter_frequencies(text):
@@ -61,6 +72,11 @@ def letter_frequencies(text):
          counts[c] += 1
      return counts
  
+def deduplicate(text):
+    return list(collections.OrderedDict.fromkeys(text))
+
+
+
  def caesar_encipher_letter(letter, shift):
      """Encipher a letter, given a shift amount
  
@@ -197,15 +213,39 @@ def affine_decipher(message, multiplier=1, adder=0, one_based=True):
      return ''.join(enciphered)
  
  
+def keyword_encipher(message, keyword, wrap_alphabet=False):
+    cipher_alphabet = ''
+    if wrap_alphabet:
+        last_keyword_letter = deduplicate(sanitise(keyword))[-1]
+        last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
+        cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase[last_keyword_position:] + string.ascii_lowercase[:last_keyword_position]))
+    else:
+        cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
+    cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
+    return message.lower().translate(cipher_translation)
+
+def keyword_decipher(message, keyword, wrap_alphabet=False):
+    cipher_alphabet = ''
+    if wrap_alphabet:
+        last_keyword_letter = deduplicate(sanitise(keyword))[-1]
+        last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
+        cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase[last_keyword_position:] + string.ascii_lowercase[:last_keyword_position]))
+    else:
+        cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
+    #cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
+    cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
+    return message.lower().translate(cipher_translation)
+
+
  def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
      """Breaks a Caesar cipher using frequency analysis
      
-    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm')
-    (4, 0.3186395289018361)
-    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert')
-    (19, 0.4215290123583277)
-    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur')
-    (13, 0.31602920807545154)
+    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
+    (4, 0.31863952890183...)
+    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
+    (19, 0.42152901235832...)
+    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
+    (13, 0.316029208075451...)
      """
      sanitised_message = sanitise(message)
      best_shift = 0
@@ -224,8 +264,8 @@ def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=no
  def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
      """Breaks an affine cipher using frequency analysis
      
-    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.')
-    ((15, 22, True), 0.2357036181865554)
+    >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
+    ((15, 22, True), 0.23570361818655...)
      """
      sanitised_message = sanitise(message)
      best_multiplier = 0
@@ -248,6 +288,24 @@ def affine_break(message, metric=norms.euclidean_distance, target_frequencies=no
      return (best_multiplier, best_adder, best_one_based), best_fit
  
  
+def keyword_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
+    best_keyword = ''
+    best_wrap_alphabet = True
+    best_fit = float("inf")
+    for wrap_alphabet in [True, False]:
+        for keyword in keywords:
+            plaintext = keyword_decipher(message, keyword, wrap_alphabet)
+            frequencies = message_frequency_scaling(letter_frequencies(plaintext))
+            fit = metric(target_frequencies, frequencies)
+            logger.info('Keyword break attempt using key {0} ({1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
+            if fit < best_fit:
+                best_fit = fit
+                best_keyword = keyword
+                best_wrap_alphabet = wrap_alphabet
+    logger.info('Keyword break best fit with key {0} ({1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
+    return (best_keyword, best_wrap_alphabet), best_fit
+
+
  if __name__ == "__main__":
      import doctest
      doctest.testmod()