cipher.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 from segment import segment
   6
   7 # To time a run:
   8 #
   9 # import timeit
  10 # c5a = open('2012/5a.ciphertext', 'r').read()
  11 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
  12
  13
  14 logger = logging.getLogger(__name__)
  15 logger.addHandler(logging.FileHandler('cipher.log'))
  16 logger.setLevel(logging.WARNING)
  17 #logger.setLevel(logging.INFO)
  18
  19 english_counts = collections.defaultdict(int)
  20 with open('count_1l.txt', 'r') as f:
  21     for line in f:
  22         (letter, count) = line.split("\t")
  23         english_counts[letter] = int(count)
  24 normalised_english_counts = norms.normalise(english_counts)
  25
  26 with open('words.txt', 'r') as f:
  27     keywords = [line.rstrip() for line in f]
  28
  29 modular_division_table = [[0]*26 for x in range(26)]
  30 for a in range(26):
  31     for b in range(26):
  32         c = (a * b) % 26
  33         modular_division_table[b][c] = a
  34
  35 modular_division_table_one_based = [[0]*27 for x in range(27)]
  36 for a in range(27):
  37     for b in range(27):
  38         c = ((a * b)-1) % 26 + 1
  39         modular_division_table_one_based[b][c] = a
  40
  41
  42
  43 def sanitise(text):
  44     """Remove all non-alphabetic characters and convert the text to lowercase
  45
  46     >>> sanitise('The Quick')
  47     'thequick'
  48     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  49     'thequickbrownfoxjumpedoverthelazydog'
  50     """
  51     sanitised = [c.lower() for c in text if c in string.ascii_letters]
  52     return ''.join(sanitised)
  53
  54 def ngrams(text, n):
  55     """Returns all n-grams of a text
  56
  57     >>> ngrams(sanitise('the quick brown fox'), 2)
  58     [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
  59     >>> ngrams(sanitise('the quick brown fox'), 4)
  60     [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
  61     """
  62     return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
  63
  64 def letter_frequencies(text):
  65     """Count the number of occurrences of each character in text
  66
  67     >>> sorted(letter_frequencies('abcdefabc').items())
  68     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
  69     >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
  70     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  71     >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
  72     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
  73     >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
  74     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  75     """
  76     counts = collections.defaultdict(int)
  77     for c in text:
  78         counts[c] += 1
  79     return counts
  80
  81 def deduplicate(text):
  82     return list(collections.OrderedDict.fromkeys(text))
  83
  84
  85
  86 def caesar_encipher_letter(letter, shift):
  87     """Encipher a letter, given a shift amount
  88
  89     >>> caesar_encipher_letter('a', 1)
  90     'b'
  91     >>> caesar_encipher_letter('a', 2)
  92     'c'
  93     >>> caesar_encipher_letter('b', 2)
  94     'd'
  95     >>> caesar_encipher_letter('x', 2)
  96     'z'
  97     >>> caesar_encipher_letter('y', 2)
  98     'a'
  99     >>> caesar_encipher_letter('z', 2)
 100     'b'
 101     >>> caesar_encipher_letter('z', -1)
 102     'y'
 103     >>> caesar_encipher_letter('a', -1)
 104     'z'
 105     """
 106     if letter in string.ascii_letters:
 107         if letter in string.ascii_uppercase:
 108             alphabet_start = ord('A')
 109         else:
 110             alphabet_start = ord('a')
 111         return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
 112     else:
 113         return letter
 114
 115 def caesar_decipher_letter(letter, shift):
 116     """Decipher a letter, given a shift amount
 117
 118     >>> caesar_decipher_letter('b', 1)
 119     'a'
 120     >>> caesar_decipher_letter('b', 2)
 121     'z'
 122     """
 123     return caesar_encipher_letter(letter, -shift)
 124
 125 def caesar_encipher(message, shift):
 126     """Encipher a message with the Caesar cipher of given shift
 127
 128     >>> caesar_encipher('abc', 1)
 129     'bcd'
 130     >>> caesar_encipher('abc', 2)
 131     'cde'
 132     >>> caesar_encipher('abcxyz', 2)
 133     'cdezab'
 134     >>> caesar_encipher('ab cx yz', 2)
 135     'cd ez ab'
 136     """
 137     enciphered = [caesar_encipher_letter(l, shift) for l in message]
 138     return ''.join(enciphered)
 139
 140 def caesar_decipher(message, shift):
 141     """Encipher a message with the Caesar cipher of given shift
 142
 143     >>> caesar_decipher('bcd', 1)
 144     'abc'
 145     >>> caesar_decipher('cde', 2)
 146     'abc'
 147     >>> caesar_decipher('cd ez ab', 2)
 148     'ab cx yz'
 149     """
 150     return caesar_encipher(message, -shift)
 151
 152 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
 153     """Encipher a letter, given a multiplier and adder
 154
 155     >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
 156     'HKNQTWZCFILORUXADGJMPSVYBE'
 157     >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
 158     'FILORUXADGJMPSVYBEHKNQTWZC'
 159     """
 160     if letter in string.ascii_letters:
 161         if letter in string.ascii_uppercase:
 162             alphabet_start = ord('A')
 163         else:
 164             alphabet_start = ord('a')
 165         letter_number = ord(letter) - alphabet_start
 166         if one_based:
 167             letter_number += 1
 168         raw_cipher_number = (letter_number * multiplier + adder)
 169         cipher_number = 0
 170         if one_based:
 171             cipher_number = (raw_cipher_number - 1) % 26
 172         else:
 173             cipher_number = raw_cipher_number % 26
 174         return chr(cipher_number + alphabet_start)
 175     else:
 176         return letter
 177
 178 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
 179     """Encipher a letter, given a multiplier and adder
 180
 181     >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
 182     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 183     >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
 184     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 185     """
 186     if letter in string.ascii_letters:
 187         if letter in string.ascii_uppercase:
 188             alphabet_start = ord('A')
 189         else:
 190             alphabet_start = ord('a')
 191         cipher_number = ord(letter) - alphabet_start
 192         if one_based:
 193             cipher_number += 1
 194         plaintext_number = 0
 195         if one_based:
 196             plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
 197         else:
 198             #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
 199             plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
 200         return chr(plaintext_number + alphabet_start)
 201     else:
 202         return letter
 203
 204 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
 205     """Encipher a message
 206
 207     >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
 208     'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
 209     """
 210     enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
 211     return ''.join(enciphered)
 212
 213 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
 214     """Decipher a message
 215
 216     >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
 217     'hours passed during which jerico tried every trick he could think of'
 218     """
 219     enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
 220     return ''.join(enciphered)
 221
 222
 223 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
 224     """Find the cipher alphabet given a keyword.
 225     wrap_alphabet controls how the rest of the alphabet is added
 226     after the keyword.
 227     0 : from 'a'
 228     1 : from the last letter in the sanitised keyword
 229     2 : from the largest letter in the sanitised keyword
 230
 231     >>> keyword_cipher_alphabet_of('bayes')
 232     'bayescdfghijklmnopqrtuvwxz'
 233     >>> keyword_cipher_alphabet_of('bayes', 0)
 234     'bayescdfghijklmnopqrtuvwxz'
 235     >>> keyword_cipher_alphabet_of('bayes', 1)
 236     'bayestuvwxzcdfghijklmnopqr'
 237     >>> keyword_cipher_alphabet_of('bayes', 2)
 238     'bayeszcdfghijklmnopqrtuvwx'
 239     """
 240     if wrap_alphabet == 0:
 241         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
 242     else:
 243         if wrap_alphabet == 1:
 244             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
 245         else:
 246             last_keyword_letter = sorted(sanitise(keyword))[-1]
 247         last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
 248         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
 249                                               string.ascii_lowercase[last_keyword_position:] +
 250                                               string.ascii_lowercase))
 251     return cipher_alphabet
 252
 253
 254 def keyword_encipher(message, keyword, wrap_alphabet=0):
 255     """Enciphers a message with a keyword substitution cipher.
 256     wrap_alphabet controls how the rest of the alphabet is added
 257     after the keyword.
 258     0 : from 'a'
 259     1 : from the last letter in the sanitised keyword
 260     2 : from the largest letter in the sanitised keyword
 261
 262     >>> keyword_encipher('test message', 'bayes')
 263     'rsqr ksqqbds'
 264     >>> keyword_encipher('test message', 'bayes', 0)
 265     'rsqr ksqqbds'
 266     >>> keyword_encipher('test message', 'bayes', 1)
 267     'lskl dskkbus'
 268     >>> keyword_encipher('test message', 'bayes', 2)
 269     'qspq jsppbcs'
 270     """
 271     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 272     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
 273     return message.lower().translate(cipher_translation)
 274
 275 def keyword_decipher(message, keyword, wrap_alphabet=0):
 276     """Deciphers a message with a keyword substitution cipher.
 277     wrap_alphabet controls how the rest of the alphabet is added
 278     after the keyword.
 279     0 : from 'a'
 280     1 : from the last letter in the sanitised keyword
 281     2 : from the largest letter in the sanitised keyword
 282
 283     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
 284     'test message'
 285     >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
 286     'test message'
 287     >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
 288     'test message'
 289     >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
 290     'test message'
 291     """
 292     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 293     cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
 294     return message.lower().translate(cipher_translation)
 295
 296
 297 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 298     """Breaks a Caesar cipher using frequency analysis
 299
 300     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
 301     (4, 0.31863952890183...)
 302     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
 303     (19, 0.42152901235832...)
 304     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
 305     (13, 0.316029208075451...)
 306     """
 307     sanitised_message = sanitise(message)
 308     best_shift = 0
 309     best_fit = float("inf")
 310     for shift in range(26):
 311         plaintext = caesar_decipher(sanitised_message, shift)
 312         frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 313         fit = metric(target_frequencies, frequencies)
 314         logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 315         if fit < best_fit:
 316             best_fit = fit
 317             best_shift = shift
 318     logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
 319     return best_shift, best_fit
 320
 321 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 322     """Breaks an affine cipher using frequency analysis
 323
 324     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
 325     ((15, 22, True), 0.23570361818655...)
 326     """
 327     sanitised_message = sanitise(message)
 328     best_multiplier = 0
 329     best_adder = 0
 330     best_one_based = True
 331     best_fit = float("inf")
 332     for one_based in [True, False]:
 333         for multiplier in range(1, 26, 2):
 334             for adder in range(26):
 335                 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
 336                 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 337                 fit = metric(target_frequencies, frequencies)
 338                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
 339                 if fit < best_fit:
 340                     best_fit = fit
 341                     best_multiplier = multiplier
 342                     best_adder = adder
 343                     best_one_based = one_based
 344     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
 345     return (best_multiplier, best_adder, best_one_based), best_fit
 346
 347
 348 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 349     """Breaks a keyword substitution cipher using a dictionary and frequency analysis
 350
 351     >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 352     (('elephant', 1), 0.41643991598441...)
 353     """
 354     best_keyword = ''
 355     best_wrap_alphabet = True
 356     best_fit = float("inf")
 357     for wrap_alphabet in range(3):
 358         for keyword in wordlist:
 359             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 360             frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 361             fit = metric(target_frequencies, frequencies)
 362             logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
 363             if fit < best_fit:
 364                 best_fit = fit
 365                 best_keyword = keyword
 366                 best_wrap_alphabet = wrap_alphabet
 367     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
 368     return (best_keyword, best_wrap_alphabet), best_fit
 369
 370
 371 if __name__ == "__main__":
 372     import doctest
 373     doctest.testmod()