cipher.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 from itertools import zip_longest
   6 from segment import segment
   7
   8 # To time a run:
   9 #
  10 # import timeit
  11 # c5a = open('2012/5a.ciphertext', 'r').read()
  12 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
  13
  14
  15 logger = logging.getLogger(__name__)
  16 logger.addHandler(logging.FileHandler('cipher.log'))
  17 logger.setLevel(logging.WARNING)
  18 #logger.setLevel(logging.INFO)
  19
  20 english_counts = collections.defaultdict(int)
  21 with open('count_1l.txt', 'r') as f:
  22     for line in f:
  23         (letter, count) = line.split("\t")
  24         english_counts[letter] = int(count)
  25 normalised_english_counts = norms.normalise(english_counts)
  26
  27 with open('words.txt', 'r') as f:
  28     keywords = [line.rstrip() for line in f]
  29
  30 modular_division_table = [[0]*26 for x in range(26)]
  31 for a in range(26):
  32     for b in range(26):
  33         c = (a * b) % 26
  34         modular_division_table[b][c] = a
  35
  36
  37 def sanitise(text):
  38     """Remove all non-alphabetic characters and convert the text to lowercase
  39
  40     >>> sanitise('The Quick')
  41     'thequick'
  42     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  43     'thequickbrownfoxjumpedoverthelazydog'
  44     """
  45     sanitised = [c.lower() for c in text if c in string.ascii_letters]
  46     return ''.join(sanitised)
  47
  48 def ngrams(text, n):
  49     """Returns all n-grams of a text
  50
  51     >>> ngrams(sanitise('the quick brown fox'), 2)
  52     [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
  53     >>> ngrams(sanitise('the quick brown fox'), 4)
  54     [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
  55     """
  56     return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
  57
  58 def every_nth(text, n):
  59     """Returns n strings, each of which consists of every nth character,
  60     starting with the 0th, 1st, 2nd, ... (n-1)th character
  61
  62     >>> every_nth(string.ascii_lowercase, 5)
  63     ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
  64     >>> every_nth(string.ascii_lowercase, 1)
  65     ['abcdefghijklmnopqrstuvwxyz']
  66     >>> every_nth(string.ascii_lowercase, 26)
  67     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  68     """
  69     split_text = [text[i:i+n] for i in range(0, len(text), n)]
  70     return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
  71
  72 def combine_every_nth(split_text):
  73     """Reforms a text split into every_nth strings
  74
  75     >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
  76     'abcdefghijklmnopqrstuvwxyz'
  77     >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
  78     'abcdefghijklmnopqrstuvwxyz'
  79     >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
  80     'abcdefghijklmnopqrstuvwxyz'
  81     """
  82     return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
  83
  84
  85 def letter_frequencies(text):
  86     """Count the number of occurrences of each character in text
  87
  88     >>> sorted(letter_frequencies('abcdefabc').items())
  89     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
  90     >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
  91     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  92     >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
  93     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
  94     >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
  95     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  96     """
  97     counts = collections.defaultdict(int)
  98     for c in text:
  99         counts[c] += 1
 100     return counts
 101
 102 def deduplicate(text):
 103     return list(collections.OrderedDict.fromkeys(text))
 104
 105
 106
 107 def caesar_encipher_letter(letter, shift):
 108     """Encipher a letter, given a shift amount
 109
 110     >>> caesar_encipher_letter('a', 1)
 111     'b'
 112     >>> caesar_encipher_letter('a', 2)
 113     'c'
 114     >>> caesar_encipher_letter('b', 2)
 115     'd'
 116     >>> caesar_encipher_letter('x', 2)
 117     'z'
 118     >>> caesar_encipher_letter('y', 2)
 119     'a'
 120     >>> caesar_encipher_letter('z', 2)
 121     'b'
 122     >>> caesar_encipher_letter('z', -1)
 123     'y'
 124     >>> caesar_encipher_letter('a', -1)
 125     'z'
 126     """
 127     if letter in string.ascii_letters:
 128         if letter in string.ascii_uppercase:
 129             alphabet_start = ord('A')
 130         else:
 131             alphabet_start = ord('a')
 132         return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
 133     else:
 134         return letter
 135
 136 def caesar_decipher_letter(letter, shift):
 137     """Decipher a letter, given a shift amount
 138
 139     >>> caesar_decipher_letter('b', 1)
 140     'a'
 141     >>> caesar_decipher_letter('b', 2)
 142     'z'
 143     """
 144     return caesar_encipher_letter(letter, -shift)
 145
 146 def caesar_encipher(message, shift):
 147     """Encipher a message with the Caesar cipher of given shift
 148
 149     >>> caesar_encipher('abc', 1)
 150     'bcd'
 151     >>> caesar_encipher('abc', 2)
 152     'cde'
 153     >>> caesar_encipher('abcxyz', 2)
 154     'cdezab'
 155     >>> caesar_encipher('ab cx yz', 2)
 156     'cd ez ab'
 157     """
 158     enciphered = [caesar_encipher_letter(l, shift) for l in message]
 159     return ''.join(enciphered)
 160
 161 def caesar_decipher(message, shift):
 162     """Encipher a message with the Caesar cipher of given shift
 163
 164     >>> caesar_decipher('bcd', 1)
 165     'abc'
 166     >>> caesar_decipher('cde', 2)
 167     'abc'
 168     >>> caesar_decipher('cd ez ab', 2)
 169     'ab cx yz'
 170     """
 171     return caesar_encipher(message, -shift)
 172
 173 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
 174     """Encipher a letter, given a multiplier and adder
 175
 176     >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
 177     'HKNQTWZCFILORUXADGJMPSVYBE'
 178     >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
 179     'FILORUXADGJMPSVYBEHKNQTWZC'
 180     """
 181     if letter in string.ascii_letters:
 182         if letter in string.ascii_uppercase:
 183             alphabet_start = ord('A')
 184         else:
 185             alphabet_start = ord('a')
 186         letter_number = ord(letter) - alphabet_start
 187         if one_based: letter_number += 1
 188         cipher_number = (letter_number * multiplier + adder) % 26
 189         if one_based: cipher_number -= 1
 190         return chr(cipher_number % 26 + alphabet_start)
 191     else:
 192         return letter
 193
 194 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
 195     """Encipher a letter, given a multiplier and adder
 196
 197     >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
 198     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 199     >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
 200     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 201     """
 202     if letter in string.ascii_letters:
 203         if letter in string.ascii_uppercase:
 204             alphabet_start = ord('A')
 205         else:
 206             alphabet_start = ord('a')
 207         cipher_number = ord(letter) - alphabet_start
 208         if one_based: cipher_number += 1
 209         plaintext_number = modular_division_table[multiplier][(cipher_number - adder) % 26]
 210         if one_based: plaintext_number -= 1
 211         return chr(plaintext_number % 26 + alphabet_start)
 212     else:
 213         return letter
 214
 215 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
 216     """Encipher a message
 217
 218     >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
 219     'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
 220     """
 221     enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
 222     return ''.join(enciphered)
 223
 224 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
 225     """Decipher a message
 226
 227     >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
 228     'hours passed during which jerico tried every trick he could think of'
 229     """
 230     enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
 231     return ''.join(enciphered)
 232
 233
 234 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
 235     """Find the cipher alphabet given a keyword.
 236     wrap_alphabet controls how the rest of the alphabet is added
 237     after the keyword.
 238     0 : from 'a'
 239     1 : from the last letter in the sanitised keyword
 240     2 : from the largest letter in the sanitised keyword
 241
 242     >>> keyword_cipher_alphabet_of('bayes')
 243     'bayescdfghijklmnopqrtuvwxz'
 244     >>> keyword_cipher_alphabet_of('bayes', 0)
 245     'bayescdfghijklmnopqrtuvwxz'
 246     >>> keyword_cipher_alphabet_of('bayes', 1)
 247     'bayestuvwxzcdfghijklmnopqr'
 248     >>> keyword_cipher_alphabet_of('bayes', 2)
 249     'bayeszcdfghijklmnopqrtuvwx'
 250     """
 251     if wrap_alphabet == 0:
 252         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
 253     else:
 254         if wrap_alphabet == 1:
 255             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
 256         else:
 257             last_keyword_letter = sorted(sanitise(keyword))[-1]
 258         last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
 259         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
 260                                               string.ascii_lowercase[last_keyword_position:] +
 261                                               string.ascii_lowercase))
 262     return cipher_alphabet
 263
 264
 265 def keyword_encipher(message, keyword, wrap_alphabet=0):
 266     """Enciphers a message with a keyword substitution cipher.
 267     wrap_alphabet controls how the rest of the alphabet is added
 268     after the keyword.
 269     0 : from 'a'
 270     1 : from the last letter in the sanitised keyword
 271     2 : from the largest letter in the sanitised keyword
 272
 273     >>> keyword_encipher('test message', 'bayes')
 274     'rsqr ksqqbds'
 275     >>> keyword_encipher('test message', 'bayes', 0)
 276     'rsqr ksqqbds'
 277     >>> keyword_encipher('test message', 'bayes', 1)
 278     'lskl dskkbus'
 279     >>> keyword_encipher('test message', 'bayes', 2)
 280     'qspq jsppbcs'
 281     """
 282     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 283     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
 284     return message.lower().translate(cipher_translation)
 285
 286 def keyword_decipher(message, keyword, wrap_alphabet=0):
 287     """Deciphers a message with a keyword substitution cipher.
 288     wrap_alphabet controls how the rest of the alphabet is added
 289     after the keyword.
 290     0 : from 'a'
 291     1 : from the last letter in the sanitised keyword
 292     2 : from the largest letter in the sanitised keyword
 293
 294     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
 295     'test message'
 296     >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
 297     'test message'
 298     >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
 299     'test message'
 300     >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
 301     'test message'
 302     """
 303     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 304     cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
 305     return message.lower().translate(cipher_translation)
 306
 307
 308 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 309     """Breaks a Caesar cipher using frequency analysis
 310
 311     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
 312     (4, 0.31863952890183...)
 313     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
 314     (19, 0.42152901235832...)
 315     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
 316     (13, 0.316029208075451...)
 317     """
 318     sanitised_message = sanitise(message)
 319     best_shift = 0
 320     best_fit = float("inf")
 321     for shift in range(26):
 322         plaintext = caesar_decipher(sanitised_message, shift)
 323         frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 324         fit = metric(target_frequencies, frequencies)
 325         logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 326         if fit < best_fit:
 327             best_fit = fit
 328             best_shift = shift
 329     logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
 330     return best_shift, best_fit
 331
 332 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 333     """Breaks an affine cipher using frequency analysis
 334
 335     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
 336     ((15, 22, True), 0.23570361818655...)
 337     """
 338     sanitised_message = sanitise(message)
 339     best_multiplier = 0
 340     best_adder = 0
 341     best_one_based = True
 342     best_fit = float("inf")
 343     for one_based in [True, False]:
 344         for multiplier in range(1, 26, 2):
 345             for adder in range(26):
 346                 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
 347                 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 348                 fit = metric(target_frequencies, frequencies)
 349                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
 350                 if fit < best_fit:
 351                     best_fit = fit
 352                     best_multiplier = multiplier
 353                     best_adder = adder
 354                     best_one_based = one_based
 355     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
 356     return (best_multiplier, best_adder, best_one_based), best_fit
 357
 358
 359 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 360     """Breaks a keyword substitution cipher using a dictionary and frequency analysis
 361
 362     >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 363     (('elephant', 1), 0.41643991598441...)
 364     """
 365     best_keyword = ''
 366     best_wrap_alphabet = True
 367     best_fit = float("inf")
 368     for wrap_alphabet in range(3):
 369         for keyword in wordlist:
 370             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 371             frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 372             fit = metric(target_frequencies, frequencies)
 373             logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
 374             if fit < best_fit:
 375                 best_fit = fit
 376                 best_keyword = keyword
 377                 best_wrap_alphabet = wrap_alphabet
 378     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
 379     return (best_keyword, best_wrap_alphabet), best_fit
 380
 381
 382 if __name__ == "__main__":
 383     import doctest
 384     doctest.testmod()