cipher.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 from itertools import zip_longest
   6 from segment import segment
   7
   8 # To time a run:
   9 #
  10 # import timeit
  11 # c5a = open('2012/5a.ciphertext', 'r').read()
  12 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
  13
  14
  15 logger = logging.getLogger(__name__)
  16 logger.addHandler(logging.FileHandler('cipher.log'))
  17 logger.setLevel(logging.WARNING)
  18 #logger.setLevel(logging.INFO)
  19
  20 english_counts = collections.defaultdict(int)
  21 with open('count_1l.txt', 'r') as f:
  22     for line in f:
  23         (letter, count) = line.split("\t")
  24         english_counts[letter] = int(count)
  25 normalised_english_counts = norms.normalise(english_counts)
  26
  27 with open('words.txt', 'r') as f:
  28     keywords = [line.rstrip() for line in f]
  29
  30 modular_division_table = [[0]*26 for x in range(26)]
  31 for a in range(26):
  32     for b in range(26):
  33         c = (a * b) % 26
  34         modular_division_table[b][c] = a
  35
  36 modular_division_table_one_based = [[0]*27 for x in range(27)]
  37 for a in range(27):
  38     for b in range(27):
  39         c = ((a * b)-1) % 26 + 1
  40         modular_division_table_one_based[b][c] = a
  41
  42
  43
  44 def sanitise(text):
  45     """Remove all non-alphabetic characters and convert the text to lowercase
  46
  47     >>> sanitise('The Quick')
  48     'thequick'
  49     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  50     'thequickbrownfoxjumpedoverthelazydog'
  51     """
  52     sanitised = [c.lower() for c in text if c in string.ascii_letters]
  53     return ''.join(sanitised)
  54
  55 def ngrams(text, n):
  56     """Returns all n-grams of a text
  57
  58     >>> ngrams(sanitise('the quick brown fox'), 2)
  59     [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
  60     >>> ngrams(sanitise('the quick brown fox'), 4)
  61     [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
  62     """
  63     return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
  64
  65 def every_nth(text, n):
  66     """Returns n strings, each of which consists of every nth character,
  67     starting with the 0th, 1st, 2nd, ... (n-1)th character
  68
  69     >>> every_nth(string.ascii_lowercase, 5)
  70     ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
  71     >>> every_nth(string.ascii_lowercase, 1)
  72     ['abcdefghijklmnopqrstuvwxyz']
  73     >>> every_nth(string.ascii_lowercase, 26)
  74     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  75     """
  76     split_text = [text[i:i+n] for i in range(0, len(text), n)]
  77     return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
  78
  79 def combine_every_nth(split_text):
  80     """Reforms a text split into every_nth strings
  81
  82     >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
  83     'abcdefghijklmnopqrstuvwxyz'
  84     >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
  85     'abcdefghijklmnopqrstuvwxyz'
  86     >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
  87     'abcdefghijklmnopqrstuvwxyz'
  88     """
  89     return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
  90
  91
  92 def letter_frequencies(text):
  93     """Count the number of occurrences of each character in text
  94
  95     >>> sorted(letter_frequencies('abcdefabc').items())
  96     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
  97     >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
  98     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  99     >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
 100     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
 101     >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
 102     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 103     """
 104     counts = collections.defaultdict(int)
 105     for c in text:
 106         counts[c] += 1
 107     return counts
 108
 109 def deduplicate(text):
 110     return list(collections.OrderedDict.fromkeys(text))
 111
 112
 113
 114 def caesar_encipher_letter(letter, shift):
 115     """Encipher a letter, given a shift amount
 116
 117     >>> caesar_encipher_letter('a', 1)
 118     'b'
 119     >>> caesar_encipher_letter('a', 2)
 120     'c'
 121     >>> caesar_encipher_letter('b', 2)
 122     'd'
 123     >>> caesar_encipher_letter('x', 2)
 124     'z'
 125     >>> caesar_encipher_letter('y', 2)
 126     'a'
 127     >>> caesar_encipher_letter('z', 2)
 128     'b'
 129     >>> caesar_encipher_letter('z', -1)
 130     'y'
 131     >>> caesar_encipher_letter('a', -1)
 132     'z'
 133     """
 134     if letter in string.ascii_letters:
 135         if letter in string.ascii_uppercase:
 136             alphabet_start = ord('A')
 137         else:
 138             alphabet_start = ord('a')
 139         return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
 140     else:
 141         return letter
 142
 143 def caesar_decipher_letter(letter, shift):
 144     """Decipher a letter, given a shift amount
 145
 146     >>> caesar_decipher_letter('b', 1)
 147     'a'
 148     >>> caesar_decipher_letter('b', 2)
 149     'z'
 150     """
 151     return caesar_encipher_letter(letter, -shift)
 152
 153 def caesar_encipher(message, shift):
 154     """Encipher a message with the Caesar cipher of given shift
 155
 156     >>> caesar_encipher('abc', 1)
 157     'bcd'
 158     >>> caesar_encipher('abc', 2)
 159     'cde'
 160     >>> caesar_encipher('abcxyz', 2)
 161     'cdezab'
 162     >>> caesar_encipher('ab cx yz', 2)
 163     'cd ez ab'
 164     """
 165     enciphered = [caesar_encipher_letter(l, shift) for l in message]
 166     return ''.join(enciphered)
 167
 168 def caesar_decipher(message, shift):
 169     """Encipher a message with the Caesar cipher of given shift
 170
 171     >>> caesar_decipher('bcd', 1)
 172     'abc'
 173     >>> caesar_decipher('cde', 2)
 174     'abc'
 175     >>> caesar_decipher('cd ez ab', 2)
 176     'ab cx yz'
 177     """
 178     return caesar_encipher(message, -shift)
 179
 180 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
 181     """Encipher a letter, given a multiplier and adder
 182
 183     >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
 184     'HKNQTWZCFILORUXADGJMPSVYBE'
 185     >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
 186     'FILORUXADGJMPSVYBEHKNQTWZC'
 187     """
 188     if letter in string.ascii_letters:
 189         if letter in string.ascii_uppercase:
 190             alphabet_start = ord('A')
 191         else:
 192             alphabet_start = ord('a')
 193         letter_number = ord(letter) - alphabet_start
 194         if one_based:
 195             letter_number += 1
 196         raw_cipher_number = (letter_number * multiplier + adder)
 197         if one_based:
 198             cipher_number = (raw_cipher_number - 1) % 26
 199         else:
 200             cipher_number = raw_cipher_number % 26
 201         return chr(cipher_number + alphabet_start)
 202     else:
 203         return letter
 204
 205 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
 206     """Encipher a letter, given a multiplier and adder
 207
 208     >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
 209     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 210     >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
 211     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 212     """
 213     if letter in string.ascii_letters:
 214         if letter in string.ascii_uppercase:
 215             alphabet_start = ord('A')
 216         else:
 217             alphabet_start = ord('a')
 218         cipher_number = ord(letter) - alphabet_start
 219         if one_based:
 220             plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number + 1 - adder + 26) % 26] - 1) % 26
 221         else:
 222             plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
 223         return chr(plaintext_number + alphabet_start)
 224     else:
 225         return letter
 226
 227 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
 228     """Encipher a message
 229
 230     >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
 231     'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
 232     """
 233     enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
 234     return ''.join(enciphered)
 235
 236 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
 237     """Decipher a message
 238
 239     >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
 240     'hours passed during which jerico tried every trick he could think of'
 241     """
 242     enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
 243     return ''.join(enciphered)
 244
 245
 246 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
 247     """Find the cipher alphabet given a keyword.
 248     wrap_alphabet controls how the rest of the alphabet is added
 249     after the keyword.
 250     0 : from 'a'
 251     1 : from the last letter in the sanitised keyword
 252     2 : from the largest letter in the sanitised keyword
 253
 254     >>> keyword_cipher_alphabet_of('bayes')
 255     'bayescdfghijklmnopqrtuvwxz'
 256     >>> keyword_cipher_alphabet_of('bayes', 0)
 257     'bayescdfghijklmnopqrtuvwxz'
 258     >>> keyword_cipher_alphabet_of('bayes', 1)
 259     'bayestuvwxzcdfghijklmnopqr'
 260     >>> keyword_cipher_alphabet_of('bayes', 2)
 261     'bayeszcdfghijklmnopqrtuvwx'
 262     """
 263     if wrap_alphabet == 0:
 264         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
 265     else:
 266         if wrap_alphabet == 1:
 267             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
 268         else:
 269             last_keyword_letter = sorted(sanitise(keyword))[-1]
 270         last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
 271         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
 272                                               string.ascii_lowercase[last_keyword_position:] +
 273                                               string.ascii_lowercase))
 274     return cipher_alphabet
 275
 276
 277 def keyword_encipher(message, keyword, wrap_alphabet=0):
 278     """Enciphers a message with a keyword substitution cipher.
 279     wrap_alphabet controls how the rest of the alphabet is added
 280     after the keyword.
 281     0 : from 'a'
 282     1 : from the last letter in the sanitised keyword
 283     2 : from the largest letter in the sanitised keyword
 284
 285     >>> keyword_encipher('test message', 'bayes')
 286     'rsqr ksqqbds'
 287     >>> keyword_encipher('test message', 'bayes', 0)
 288     'rsqr ksqqbds'
 289     >>> keyword_encipher('test message', 'bayes', 1)
 290     'lskl dskkbus'
 291     >>> keyword_encipher('test message', 'bayes', 2)
 292     'qspq jsppbcs'
 293     """
 294     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 295     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
 296     return message.lower().translate(cipher_translation)
 297
 298 def keyword_decipher(message, keyword, wrap_alphabet=0):
 299     """Deciphers a message with a keyword substitution cipher.
 300     wrap_alphabet controls how the rest of the alphabet is added
 301     after the keyword.
 302     0 : from 'a'
 303     1 : from the last letter in the sanitised keyword
 304     2 : from the largest letter in the sanitised keyword
 305
 306     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
 307     'test message'
 308     >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
 309     'test message'
 310     >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
 311     'test message'
 312     >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
 313     'test message'
 314     """
 315     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 316     cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
 317     return message.lower().translate(cipher_translation)
 318
 319
 320 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 321     """Breaks a Caesar cipher using frequency analysis
 322
 323     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
 324     (4, 0.31863952890183...)
 325     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
 326     (19, 0.42152901235832...)
 327     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
 328     (13, 0.316029208075451...)
 329     """
 330     sanitised_message = sanitise(message)
 331     best_shift = 0
 332     best_fit = float("inf")
 333     for shift in range(26):
 334         plaintext = caesar_decipher(sanitised_message, shift)
 335         frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 336         fit = metric(target_frequencies, frequencies)
 337         logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 338         if fit < best_fit:
 339             best_fit = fit
 340             best_shift = shift
 341     logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
 342     return best_shift, best_fit
 343
 344 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 345     """Breaks an affine cipher using frequency analysis
 346
 347     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
 348     ((15, 22, True), 0.23570361818655...)
 349     """
 350     sanitised_message = sanitise(message)
 351     best_multiplier = 0
 352     best_adder = 0
 353     best_one_based = True
 354     best_fit = float("inf")
 355     for one_based in [True, False]:
 356         for multiplier in range(1, 26, 2):
 357             for adder in range(26):
 358                 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
 359                 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 360                 fit = metric(target_frequencies, frequencies)
 361                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
 362                 if fit < best_fit:
 363                     best_fit = fit
 364                     best_multiplier = multiplier
 365                     best_adder = adder
 366                     best_one_based = one_based
 367     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
 368     return (best_multiplier, best_adder, best_one_based), best_fit
 369
 370
 371 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 372     """Breaks a keyword substitution cipher using a dictionary and frequency analysis
 373
 374     >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 375     (('elephant', 1), 0.41643991598441...)
 376     """
 377     best_keyword = ''
 378     best_wrap_alphabet = True
 379     best_fit = float("inf")
 380     for wrap_alphabet in range(3):
 381         for keyword in wordlist:
 382             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 383             frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 384             fit = metric(target_frequencies, frequencies)
 385             logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
 386             if fit < best_fit:
 387                 best_fit = fit
 388                 best_keyword = keyword
 389                 best_wrap_alphabet = wrap_alphabet
 390     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
 391     return (best_keyword, best_wrap_alphabet), best_fit
 392
 393
 394 if __name__ == "__main__":
 395     import doctest
 396     doctest.testmod()