cipher.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 import math
   6 from itertools import zip_longest
   7 from segment import segment
   8
   9 # To time a run:
  10 #
  11 # import timeit
  12 # c5a = open('2012/5a.ciphertext', 'r').read()
  13 # timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
  14
  15
  16 logger = logging.getLogger(__name__)
  17 logger.addHandler(logging.FileHandler('cipher.log'))
  18 logger.setLevel(logging.WARNING)
  19 #logger.setLevel(logging.INFO)
  20
  21 english_counts = collections.defaultdict(int)
  22 with open('count_1l.txt', 'r') as f:
  23     for line in f:
  24         (letter, count) = line.split("\t")
  25         english_counts[letter] = int(count)
  26 normalised_english_counts = norms.normalise(english_counts)
  27
  28 with open('words.txt', 'r') as f:
  29     keywords = [line.rstrip() for line in f]
  30
  31 modular_division_table = [[0]*26 for x in range(26)]
  32 for a in range(26):
  33     for b in range(26):
  34         c = (a * b) % 26
  35         modular_division_table[b][c] = a
  36
  37
  38 def sanitise(text):
  39     """Remove all non-alphabetic characters and convert the text to lowercase
  40
  41     >>> sanitise('The Quick')
  42     'thequick'
  43     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  44     'thequickbrownfoxjumpedoverthelazydog'
  45     """
  46     sanitised = [c.lower() for c in text if c in string.ascii_letters]
  47     return ''.join(sanitised)
  48
  49 def ngrams(text, n):
  50     """Returns all n-grams of a text
  51
  52     >>> ngrams(sanitise('the quick brown fox'), 2)
  53     [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
  54     >>> ngrams(sanitise('the quick brown fox'), 4)
  55     [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
  56     """
  57     return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
  58
  59 def every_nth(text, n):
  60     """Returns n strings, each of which consists of every nth character,
  61     starting with the 0th, 1st, 2nd, ... (n-1)th character
  62
  63     >>> every_nth(string.ascii_lowercase, 5)
  64     ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
  65     >>> every_nth(string.ascii_lowercase, 1)
  66     ['abcdefghijklmnopqrstuvwxyz']
  67     >>> every_nth(string.ascii_lowercase, 26)
  68     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  69     """
  70     split_text = [text[i:i+n] for i in range(0, len(text), n)]
  71     return [''.join(l) for l in zip_longest(*split_text, fillvalue='')]
  72
  73 def combine_every_nth(split_text):
  74     """Reforms a text split into every_nth strings
  75
  76     >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
  77     'abcdefghijklmnopqrstuvwxyz'
  78     >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
  79     'abcdefghijklmnopqrstuvwxyz'
  80     >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
  81     'abcdefghijklmnopqrstuvwxyz'
  82     """
  83     return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')])
  84
  85
  86 def letter_frequencies(text):
  87     """Count the number of occurrences of each character in text
  88
  89     >>> sorted(letter_frequencies('abcdefabc').items())
  90     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
  91     >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
  92     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  93     >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
  94     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
  95     >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
  96     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  97     """
  98     counts = collections.defaultdict(int)
  99     for c in text:
 100         counts[c] += 1
 101     return counts
 102
 103 def deduplicate(text):
 104     return list(collections.OrderedDict.fromkeys(text))
 105
 106
 107
 108 def caesar_encipher_letter(letter, shift):
 109     """Encipher a letter, given a shift amount
 110
 111     >>> caesar_encipher_letter('a', 1)
 112     'b'
 113     >>> caesar_encipher_letter('a', 2)
 114     'c'
 115     >>> caesar_encipher_letter('b', 2)
 116     'd'
 117     >>> caesar_encipher_letter('x', 2)
 118     'z'
 119     >>> caesar_encipher_letter('y', 2)
 120     'a'
 121     >>> caesar_encipher_letter('z', 2)
 122     'b'
 123     >>> caesar_encipher_letter('z', -1)
 124     'y'
 125     >>> caesar_encipher_letter('a', -1)
 126     'z'
 127     """
 128     if letter in string.ascii_letters:
 129         if letter in string.ascii_uppercase:
 130             alphabet_start = ord('A')
 131         else:
 132             alphabet_start = ord('a')
 133         return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
 134     else:
 135         return letter
 136
 137 def caesar_decipher_letter(letter, shift):
 138     """Decipher a letter, given a shift amount
 139
 140     >>> caesar_decipher_letter('b', 1)
 141     'a'
 142     >>> caesar_decipher_letter('b', 2)
 143     'z'
 144     """
 145     return caesar_encipher_letter(letter, -shift)
 146
 147 def caesar_encipher(message, shift):
 148     """Encipher a message with the Caesar cipher of given shift
 149
 150     >>> caesar_encipher('abc', 1)
 151     'bcd'
 152     >>> caesar_encipher('abc', 2)
 153     'cde'
 154     >>> caesar_encipher('abcxyz', 2)
 155     'cdezab'
 156     >>> caesar_encipher('ab cx yz', 2)
 157     'cd ez ab'
 158     """
 159     enciphered = [caesar_encipher_letter(l, shift) for l in message]
 160     return ''.join(enciphered)
 161
 162 def caesar_decipher(message, shift):
 163     """Encipher a message with the Caesar cipher of given shift
 164
 165     >>> caesar_decipher('bcd', 1)
 166     'abc'
 167     >>> caesar_decipher('cde', 2)
 168     'abc'
 169     >>> caesar_decipher('cd ez ab', 2)
 170     'ab cx yz'
 171     """
 172     return caesar_encipher(message, -shift)
 173
 174 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
 175     """Encipher a letter, given a multiplier and adder
 176
 177     >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
 178     'HKNQTWZCFILORUXADGJMPSVYBE'
 179     >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
 180     'FILORUXADGJMPSVYBEHKNQTWZC'
 181     """
 182     if letter in string.ascii_letters:
 183         if letter in string.ascii_uppercase:
 184             alphabet_start = ord('A')
 185         else:
 186             alphabet_start = ord('a')
 187         letter_number = ord(letter) - alphabet_start
 188         if one_based: letter_number += 1
 189         cipher_number = (letter_number * multiplier + adder) % 26
 190         if one_based: cipher_number -= 1
 191         return chr(cipher_number % 26 + alphabet_start)
 192     else:
 193         return letter
 194
 195 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
 196     """Encipher a letter, given a multiplier and adder
 197
 198     >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
 199     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 200     >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
 201     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 202     """
 203     if letter in string.ascii_letters:
 204         if letter in string.ascii_uppercase:
 205             alphabet_start = ord('A')
 206         else:
 207             alphabet_start = ord('a')
 208         cipher_number = ord(letter) - alphabet_start
 209         if one_based: cipher_number += 1
 210         plaintext_number = modular_division_table[multiplier][(cipher_number - adder) % 26]
 211         if one_based: plaintext_number -= 1
 212         return chr(plaintext_number % 26 + alphabet_start)
 213     else:
 214         return letter
 215
 216 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
 217     """Encipher a message
 218
 219     >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
 220     'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
 221     """
 222     enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
 223     return ''.join(enciphered)
 224
 225 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
 226     """Decipher a message
 227
 228     >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
 229     'hours passed during which jerico tried every trick he could think of'
 230     """
 231     enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
 232     return ''.join(enciphered)
 233
 234
 235 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
 236     """Find the cipher alphabet given a keyword.
 237     wrap_alphabet controls how the rest of the alphabet is added
 238     after the keyword.
 239     0 : from 'a'
 240     1 : from the last letter in the sanitised keyword
 241     2 : from the largest letter in the sanitised keyword
 242
 243     >>> keyword_cipher_alphabet_of('bayes')
 244     'bayescdfghijklmnopqrtuvwxz'
 245     >>> keyword_cipher_alphabet_of('bayes', 0)
 246     'bayescdfghijklmnopqrtuvwxz'
 247     >>> keyword_cipher_alphabet_of('bayes', 1)
 248     'bayestuvwxzcdfghijklmnopqr'
 249     >>> keyword_cipher_alphabet_of('bayes', 2)
 250     'bayeszcdfghijklmnopqrtuvwx'
 251     """
 252     if wrap_alphabet == 0:
 253         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
 254     else:
 255         if wrap_alphabet == 1:
 256             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
 257         else:
 258             last_keyword_letter = sorted(sanitise(keyword))[-1]
 259         last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
 260         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
 261                                               string.ascii_lowercase[last_keyword_position:] +
 262                                               string.ascii_lowercase))
 263     return cipher_alphabet
 264
 265
 266 def keyword_encipher(message, keyword, wrap_alphabet=0):
 267     """Enciphers a message with a keyword substitution cipher.
 268     wrap_alphabet controls how the rest of the alphabet is added
 269     after the keyword.
 270     0 : from 'a'
 271     1 : from the last letter in the sanitised keyword
 272     2 : from the largest letter in the sanitised keyword
 273
 274     >>> keyword_encipher('test message', 'bayes')
 275     'rsqr ksqqbds'
 276     >>> keyword_encipher('test message', 'bayes', 0)
 277     'rsqr ksqqbds'
 278     >>> keyword_encipher('test message', 'bayes', 1)
 279     'lskl dskkbus'
 280     >>> keyword_encipher('test message', 'bayes', 2)
 281     'qspq jsppbcs'
 282     """
 283     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 284     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
 285     return message.lower().translate(cipher_translation)
 286
 287 def keyword_decipher(message, keyword, wrap_alphabet=0):
 288     """Deciphers a message with a keyword substitution cipher.
 289     wrap_alphabet controls how the rest of the alphabet is added
 290     after the keyword.
 291     0 : from 'a'
 292     1 : from the last letter in the sanitised keyword
 293     2 : from the largest letter in the sanitised keyword
 294
 295     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
 296     'test message'
 297     >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
 298     'test message'
 299     >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
 300     'test message'
 301     >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
 302     'test message'
 303     """
 304     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 305     cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
 306     return message.lower().translate(cipher_translation)
 307
 308 def scytale_encipher(message, rows):
 309     """Enciphers using the scytale transposition cipher
 310
 311     >>> scytale_encipher('thequickbrownfox', 3)
 312     'tcnhkfeboqrxuoiw'
 313     >>> scytale_encipher('thequickbrownfox', 4)
 314     'tubnhirfecooqkwx'
 315     >>> scytale_encipher('thequickbrownfox', 5)
 316     'tubnhirfecooqkwx'
 317     >>> scytale_encipher('thequickbrownfox', 6)
 318     'tqcrnxhukofeibwo'
 319     >>> scytale_encipher('thequickbrownfox', 7)
 320     'tqcrnxhukofeibwo'
 321     """
 322     row_length = math.ceil(len(message) / rows)
 323     slices = [message[i:i+row_length] for i in range(0, len(message), row_length)]
 324     return ''.join([''.join(r) for r in zip_longest(*slices, fillvalue='')])
 325
 326 def scytale_decipher(message, rows):
 327     """Deciphers using the scytale transposition cipher
 328
 329     >>> scytale_decipher('tcnhkfeboqrxuoiw', 3)
 330     'thequickbrownfox'
 331     """
 332     cols = math.ceil(len(message) / rows)
 333     if len(message) % rows == 0:
 334         part_cols = 0
 335     else:
 336         part_cols = rows - len(message) % rows
 337     full_cols = cols - part_cols
 338     columns = [message[i:i+rows] for i in range(0, full_cols * rows, rows)] + \
 339               [message[i:i+rows-1] for i in range(full_cols * rows, len(message), rows - 1)]
 340     return ''.join([''.join(c) for c in zip_longest(*columns, fillvalue='')])
 341
 342
 343 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 344     """Breaks a Caesar cipher using frequency analysis
 345
 346     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
 347     (4, 0.31863952890183...)
 348     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
 349     (19, 0.42152901235832...)
 350     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
 351     (13, 0.316029208075451...)
 352     """
 353     sanitised_message = sanitise(message)
 354     best_shift = 0
 355     best_fit = float("inf")
 356     for shift in range(26):
 357         plaintext = caesar_decipher(sanitised_message, shift)
 358         frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 359         fit = metric(target_frequencies, frequencies)
 360         logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 361         if fit < best_fit:
 362             best_fit = fit
 363             best_shift = shift
 364     logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
 365     return best_shift, best_fit
 366
 367 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 368     """Breaks an affine cipher using frequency analysis
 369
 370     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
 371     ((15, 22, True), 0.23570361818655...)
 372     """
 373     sanitised_message = sanitise(message)
 374     best_multiplier = 0
 375     best_adder = 0
 376     best_one_based = True
 377     best_fit = float("inf")
 378     for one_based in [True, False]:
 379         for multiplier in range(1, 26, 2):
 380             for adder in range(26):
 381                 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
 382                 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 383                 fit = metric(target_frequencies, frequencies)
 384                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
 385                 if fit < best_fit:
 386                     best_fit = fit
 387                     best_multiplier = multiplier
 388                     best_adder = adder
 389                     best_one_based = one_based
 390     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
 391     return (best_multiplier, best_adder, best_one_based), best_fit
 392
 393
 394 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 395     """Breaks a keyword substitution cipher using a dictionary and frequency analysis
 396
 397     >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 398     (('elephant', 1), 0.41643991598441...)
 399     """
 400     best_keyword = ''
 401     best_wrap_alphabet = True
 402     best_fit = float("inf")
 403     for wrap_alphabet in range(3):
 404         for keyword in wordlist:
 405             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 406             frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 407             fit = metric(target_frequencies, frequencies)
 408             logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
 409             if fit < best_fit:
 410                 best_fit = fit
 411                 best_keyword = keyword
 412                 best_wrap_alphabet = wrap_alphabet
 413     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
 414     return (best_keyword, best_wrap_alphabet), best_fit
 415
 416
 417 if __name__ == "__main__":
 418     import doctest
 419     doctest.testmod()