cipher.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 from itertools import zip_longest
   6 from segment import segment
   7
   8 # To time a run:
   9 #
  10 # import timeit
  11 # c5a = open('2012/5a.ciphertext', 'r').read()
  12 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
  13
  14
  15 logger = logging.getLogger(__name__)
  16 logger.addHandler(logging.FileHandler('cipher.log'))
  17 logger.setLevel(logging.WARNING)
  18 #logger.setLevel(logging.INFO)
  19
  20 english_counts = collections.defaultdict(int)
  21 with open('count_1l.txt', 'r') as f:
  22     for line in f:
  23         (letter, count) = line.split("\t")
  24         english_counts[letter] = int(count)
  25 normalised_english_counts = norms.normalise(english_counts)
  26
  27 with open('words.txt', 'r') as f:
  28     keywords = [line.rstrip() for line in f]
  29
  30 modular_division_table = [[0]*26 for x in range(26)]
  31 for a in range(26):
  32     for b in range(26):
  33         c = (a * b) % 26
  34         modular_division_table[b][c] = a
  35
  36 modular_division_table_one_based = [[0]*27 for x in range(27)]
  37 for a in range(27):
  38     for b in range(27):
  39         c = ((a * b)-1) % 26 + 1
  40         modular_division_table_one_based[b][c] = a
  41
  42
  43
  44 def sanitise(text):
  45     """Remove all non-alphabetic characters and convert the text to lowercase
  46
  47     >>> sanitise('The Quick')
  48     'thequick'
  49     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  50     'thequickbrownfoxjumpedoverthelazydog'
  51     """
  52     sanitised = [c.lower() for c in text if c in string.ascii_letters]
  53     return ''.join(sanitised)
  54
  55 def ngrams(text, n):
  56     """Returns all n-grams of a text
  57
  58     >>> ngrams(sanitise('the quick brown fox'), 2)
  59     [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
  60     >>> ngrams(sanitise('the quick brown fox'), 4)
  61     [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
  62     """
  63     return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
  64
  65 def every_nth(text, n):
  66     """Returns n strings, each of which consists of every nth character,
  67     starting with the 0th, 1st, 2nd, ... (n-1)th character
  68
  69     >>> every_nth(string.ascii_lowercase, 5)
  70     ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
  71     >>> every_nth(string.ascii_lowercase, 1)
  72     ['abcdefghijklmnopqrstuvwxyz']
  73     >>> every_nth(string.ascii_lowercase, 26)
  74     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  75     """
  76     split_text = [text[i:i+n] for i in range(0, len(text), n)]
  77     return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
  78
  79 def combine_every_nth(split_text):
  80     """Reforms a text split into every_nth strings
  81
  82     >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
  83     'abcdefghijklmnopqrstuvwxyz'
  84     >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
  85     'abcdefghijklmnopqrstuvwxyz'
  86     >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
  87     'abcdefghijklmnopqrstuvwxyz'
  88     """
  89     return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
  90
  91
  92 def letter_frequencies(text):
  93     """Count the number of occurrences of each character in text
  94
  95     >>> sorted(letter_frequencies('abcdefabc').items())
  96     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
  97     >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
  98     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  99     >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
 100     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
 101     >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
 102     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 103     """
 104     counts = collections.defaultdict(int)
 105     for c in text:
 106         counts[c] += 1
 107     return counts
 108
 109 def deduplicate(text):
 110     return list(collections.OrderedDict.fromkeys(text))
 111
 112
 113
 114 def caesar_encipher_letter(letter, shift):
 115     """Encipher a letter, given a shift amount
 116
 117     >>> caesar_encipher_letter('a', 1)
 118     'b'
 119     >>> caesar_encipher_letter('a', 2)
 120     'c'
 121     >>> caesar_encipher_letter('b', 2)
 122     'd'
 123     >>> caesar_encipher_letter('x', 2)
 124     'z'
 125     >>> caesar_encipher_letter('y', 2)
 126     'a'
 127     >>> caesar_encipher_letter('z', 2)
 128     'b'
 129     >>> caesar_encipher_letter('z', -1)
 130     'y'
 131     >>> caesar_encipher_letter('a', -1)
 132     'z'
 133     """
 134     if letter in string.ascii_letters:
 135         if letter in string.ascii_uppercase:
 136             alphabet_start = ord('A')
 137         else:
 138             alphabet_start = ord('a')
 139         return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
 140     else:
 141         return letter
 142
 143 def caesar_decipher_letter(letter, shift):
 144     """Decipher a letter, given a shift amount
 145
 146     >>> caesar_decipher_letter('b', 1)
 147     'a'
 148     >>> caesar_decipher_letter('b', 2)
 149     'z'
 150     """
 151     return caesar_encipher_letter(letter, -shift)
 152
 153 def caesar_encipher(message, shift):
 154     """Encipher a message with the Caesar cipher of given shift
 155
 156     >>> caesar_encipher('abc', 1)
 157     'bcd'
 158     >>> caesar_encipher('abc', 2)
 159     'cde'
 160     >>> caesar_encipher('abcxyz', 2)
 161     'cdezab'
 162     >>> caesar_encipher('ab cx yz', 2)
 163     'cd ez ab'
 164     """
 165     enciphered = [caesar_encipher_letter(l, shift) for l in message]
 166     return ''.join(enciphered)
 167
 168 def caesar_decipher(message, shift):
 169     """Encipher a message with the Caesar cipher of given shift
 170
 171     >>> caesar_decipher('bcd', 1)
 172     'abc'
 173     >>> caesar_decipher('cde', 2)
 174     'abc'
 175     >>> caesar_decipher('cd ez ab', 2)
 176     'ab cx yz'
 177     """
 178     return caesar_encipher(message, -shift)
 179
 180 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
 181     """Encipher a letter, given a multiplier and adder
 182
 183     >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
 184     'HKNQTWZCFILORUXADGJMPSVYBE'
 185     >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
 186     'FILORUXADGJMPSVYBEHKNQTWZC'
 187     """
 188     if letter in string.ascii_letters:
 189         if letter in string.ascii_uppercase:
 190             alphabet_start = ord('A')
 191         else:
 192             alphabet_start = ord('a')
 193         letter_number = ord(letter) - alphabet_start
 194         if one_based:
 195             letter_number += 1
 196         raw_cipher_number = (letter_number * multiplier + adder)
 197         cipher_number = 0
 198         if one_based:
 199             cipher_number = (raw_cipher_number - 1) % 26
 200         else:
 201             cipher_number = raw_cipher_number % 26
 202         return chr(cipher_number + alphabet_start)
 203     else:
 204         return letter
 205
 206 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
 207     """Encipher a letter, given a multiplier and adder
 208
 209     >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
 210     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 211     >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
 212     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 213     """
 214     if letter in string.ascii_letters:
 215         if letter in string.ascii_uppercase:
 216             alphabet_start = ord('A')
 217         else:
 218             alphabet_start = ord('a')
 219         cipher_number = ord(letter) - alphabet_start
 220         if one_based:
 221             cipher_number += 1
 222         plaintext_number = 0
 223         if one_based:
 224             plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
 225         else:
 226             #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
 227             plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
 228         return chr(plaintext_number + alphabet_start)
 229     else:
 230         return letter
 231
 232 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
 233     """Encipher a message
 234
 235     >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
 236     'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
 237     """
 238     enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
 239     return ''.join(enciphered)
 240
 241 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
 242     """Decipher a message
 243
 244     >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
 245     'hours passed during which jerico tried every trick he could think of'
 246     """
 247     enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
 248     return ''.join(enciphered)
 249
 250
 251 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
 252     """Find the cipher alphabet given a keyword.
 253     wrap_alphabet controls how the rest of the alphabet is added
 254     after the keyword.
 255     0 : from 'a'
 256     1 : from the last letter in the sanitised keyword
 257     2 : from the largest letter in the sanitised keyword
 258
 259     >>> keyword_cipher_alphabet_of('bayes')
 260     'bayescdfghijklmnopqrtuvwxz'
 261     >>> keyword_cipher_alphabet_of('bayes', 0)
 262     'bayescdfghijklmnopqrtuvwxz'
 263     >>> keyword_cipher_alphabet_of('bayes', 1)
 264     'bayestuvwxzcdfghijklmnopqr'
 265     >>> keyword_cipher_alphabet_of('bayes', 2)
 266     'bayeszcdfghijklmnopqrtuvwx'
 267     """
 268     if wrap_alphabet == 0:
 269         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
 270     else:
 271         if wrap_alphabet == 1:
 272             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
 273         else:
 274             last_keyword_letter = sorted(sanitise(keyword))[-1]
 275         last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
 276         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
 277                                               string.ascii_lowercase[last_keyword_position:] +
 278                                               string.ascii_lowercase))
 279     return cipher_alphabet
 280
 281
 282 def keyword_encipher(message, keyword, wrap_alphabet=0):
 283     """Enciphers a message with a keyword substitution cipher.
 284     wrap_alphabet controls how the rest of the alphabet is added
 285     after the keyword.
 286     0 : from 'a'
 287     1 : from the last letter in the sanitised keyword
 288     2 : from the largest letter in the sanitised keyword
 289
 290     >>> keyword_encipher('test message', 'bayes')
 291     'rsqr ksqqbds'
 292     >>> keyword_encipher('test message', 'bayes', 0)
 293     'rsqr ksqqbds'
 294     >>> keyword_encipher('test message', 'bayes', 1)
 295     'lskl dskkbus'
 296     >>> keyword_encipher('test message', 'bayes', 2)
 297     'qspq jsppbcs'
 298     """
 299     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 300     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
 301     return message.lower().translate(cipher_translation)
 302
 303 def keyword_decipher(message, keyword, wrap_alphabet=0):
 304     """Deciphers a message with a keyword substitution cipher.
 305     wrap_alphabet controls how the rest of the alphabet is added
 306     after the keyword.
 307     0 : from 'a'
 308     1 : from the last letter in the sanitised keyword
 309     2 : from the largest letter in the sanitised keyword
 310
 311     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
 312     'test message'
 313     >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
 314     'test message'
 315     >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
 316     'test message'
 317     >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
 318     'test message'
 319     """
 320     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 321     cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
 322     return message.lower().translate(cipher_translation)
 323
 324
 325 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 326     """Breaks a Caesar cipher using frequency analysis
 327
 328     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
 329     (4, 0.31863952890183...)
 330     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
 331     (19, 0.42152901235832...)
 332     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
 333     (13, 0.316029208075451...)
 334     """
 335     sanitised_message = sanitise(message)
 336     best_shift = 0
 337     best_fit = float("inf")
 338     for shift in range(26):
 339         plaintext = caesar_decipher(sanitised_message, shift)
 340         frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 341         fit = metric(target_frequencies, frequencies)
 342         logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 343         if fit < best_fit:
 344             best_fit = fit
 345             best_shift = shift
 346     logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
 347     return best_shift, best_fit
 348
 349 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 350     """Breaks an affine cipher using frequency analysis
 351
 352     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
 353     ((15, 22, True), 0.23570361818655...)
 354     """
 355     sanitised_message = sanitise(message)
 356     best_multiplier = 0
 357     best_adder = 0
 358     best_one_based = True
 359     best_fit = float("inf")
 360     for one_based in [True, False]:
 361         for multiplier in range(1, 26, 2):
 362             for adder in range(26):
 363                 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
 364                 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 365                 fit = metric(target_frequencies, frequencies)
 366                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
 367                 if fit < best_fit:
 368                     best_fit = fit
 369                     best_multiplier = multiplier
 370                     best_adder = adder
 371                     best_one_based = one_based
 372     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
 373     return (best_multiplier, best_adder, best_one_based), best_fit
 374
 375
 376 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 377     """Breaks a keyword substitution cipher using a dictionary and frequency analysis
 378
 379     >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 380     (('elephant', 1), 0.41643991598441...)
 381     """
 382     best_keyword = ''
 383     best_wrap_alphabet = True
 384     best_fit = float("inf")
 385     for wrap_alphabet in range(3):
 386         for keyword in wordlist:
 387             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 388             frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 389             fit = metric(target_frequencies, frequencies)
 390             logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
 391             if fit < best_fit:
 392                 best_fit = fit
 393                 best_keyword = keyword
 394                 best_wrap_alphabet = wrap_alphabet
 395     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
 396     return (best_keyword, best_wrap_alphabet), best_fit
 397
 398
 399 if __name__ == "__main__":
 400     import doctest
 401     doctest.testmod()