cipher.py

   1 import string
   2 import collections
   3 import norms
   4 import logging
   5 from segment import segment
   6
   7 logger = logging.getLogger(__name__)
   8 logger.addHandler(logging.FileHandler('cipher.log'))
   9 logger.setLevel(logging.WARNING)
  10 #logger.setLevel(logging.INFO)
  11
  12 english_counts = collections.defaultdict(int)
  13 with open('count_1l.txt', 'r') as f:
  14     for line in f:
  15         (letter, count) = line.split("\t")
  16         english_counts[letter] = int(count)
  17 normalised_english_counts = norms.normalise(english_counts)
  18
  19 with open('words.txt', 'r') as f:
  20     keywords = [line.rstrip() for line in f]
  21
  22 modular_division_table = [[0]*26 for x in range(26)]
  23 for a in range(26):
  24     for b in range(26):
  25         c = (a * b) % 26
  26         modular_division_table[b][c] = a
  27
  28 modular_division_table_one_based = [[0]*27 for x in range(27)]
  29 for a in range(27):
  30     for b in range(27):
  31         c = ((a * b)-1) % 26 + 1
  32         modular_division_table_one_based[b][c] = a
  33
  34
  35
  36 def sanitise(text):
  37     """Remove all non-alphabetic characters and convert the text to lowercase
  38
  39     >>> sanitise('The Quick')
  40     'thequick'
  41     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  42     'thequickbrownfoxjumpedoverthelazydog'
  43     """
  44     sanitised = [c.lower() for c in text if c in string.ascii_letters]
  45     return ''.join(sanitised)
  46
  47 def ngrams(text, n):
  48     """Returns all n-grams of a text
  49
  50     >>> ngrams(sanitise('the quick brown fox'), 2)
  51     [('t', 'h'), ('h', 'e'), ('e', 'q'), ('q', 'u'), ('u', 'i'), ('i', 'c'), ('c', 'k'), ('k', 'b'), ('b', 'r'), ('r', 'o'), ('o', 'w'), ('w', 'n'), ('n', 'f'), ('f', 'o'), ('o', 'x')]
  52     >>> ngrams(sanitise('the quick brown fox'), 4)
  53     [('t', 'h', 'e', 'q'), ('h', 'e', 'q', 'u'), ('e', 'q', 'u', 'i'), ('q', 'u', 'i', 'c'), ('u', 'i', 'c', 'k'), ('i', 'c', 'k', 'b'), ('c', 'k', 'b', 'r'), ('k', 'b', 'r', 'o'), ('b', 'r', 'o', 'w'), ('r', 'o', 'w', 'n'), ('o', 'w', 'n', 'f'), ('w', 'n', 'f', 'o'), ('n', 'f', 'o', 'x')]
  54     """
  55     return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
  56
  57 def letter_frequencies(text):
  58     """Count the number of occurrences of each character in text
  59
  60     >>> sorted(letter_frequencies('abcdefabc').items())
  61     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
  62     >>> sorted(letter_frequencies('the quick brown fox jumped over the lazy dog').items())
  63     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  64     >>> sorted(letter_frequencies('The Quick BROWN fox jumped! over... the (9lazy) DOG').items())
  65     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
  66     >>> sorted(letter_frequencies(sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')).items())
  67     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
  68     """
  69     counts = collections.defaultdict(int)
  70     for c in text:
  71         counts[c] += 1
  72     return counts
  73
  74 def deduplicate(text):
  75     return list(collections.OrderedDict.fromkeys(text))
  76
  77
  78
  79 def caesar_encipher_letter(letter, shift):
  80     """Encipher a letter, given a shift amount
  81
  82     >>> caesar_encipher_letter('a', 1)
  83     'b'
  84     >>> caesar_encipher_letter('a', 2)
  85     'c'
  86     >>> caesar_encipher_letter('b', 2)
  87     'd'
  88     >>> caesar_encipher_letter('x', 2)
  89     'z'
  90     >>> caesar_encipher_letter('y', 2)
  91     'a'
  92     >>> caesar_encipher_letter('z', 2)
  93     'b'
  94     >>> caesar_encipher_letter('z', -1)
  95     'y'
  96     >>> caesar_encipher_letter('a', -1)
  97     'z'
  98     """
  99     if letter in string.ascii_letters:
 100         if letter in string.ascii_uppercase:
 101             alphabet_start = ord('A')
 102         else:
 103             alphabet_start = ord('a')
 104         return chr(((ord(letter) - alphabet_start + shift) % 26) + alphabet_start)
 105     else:
 106         return letter
 107
 108 def caesar_decipher_letter(letter, shift):
 109     """Decipher a letter, given a shift amount
 110
 111     >>> caesar_decipher_letter('b', 1)
 112     'a'
 113     >>> caesar_decipher_letter('b', 2)
 114     'z'
 115     """
 116     return caesar_encipher_letter(letter, -shift)
 117
 118 def caesar_encipher(message, shift):
 119     """Encipher a message with the Caesar cipher of given shift
 120
 121     >>> caesar_encipher('abc', 1)
 122     'bcd'
 123     >>> caesar_encipher('abc', 2)
 124     'cde'
 125     >>> caesar_encipher('abcxyz', 2)
 126     'cdezab'
 127     >>> caesar_encipher('ab cx yz', 2)
 128     'cd ez ab'
 129     """
 130     enciphered = [caesar_encipher_letter(l, shift) for l in message]
 131     return ''.join(enciphered)
 132
 133 def caesar_decipher(message, shift):
 134     """Encipher a message with the Caesar cipher of given shift
 135
 136     >>> caesar_decipher('bcd', 1)
 137     'abc'
 138     >>> caesar_decipher('cde', 2)
 139     'abc'
 140     >>> caesar_decipher('cd ez ab', 2)
 141     'ab cx yz'
 142     """
 143     return caesar_encipher(message, -shift)
 144
 145 def affine_encipher_letter(letter, multiplier=1, adder=0, one_based=True):
 146     """Encipher a letter, given a multiplier and adder
 147
 148     >>> ''.join([affine_encipher_letter(l, 3, 5, True) for l in string.ascii_uppercase])
 149     'HKNQTWZCFILORUXADGJMPSVYBE'
 150     >>> ''.join([affine_encipher_letter(l, 3, 5, False) for l in string.ascii_uppercase])
 151     'FILORUXADGJMPSVYBEHKNQTWZC'
 152     """
 153     if letter in string.ascii_letters:
 154         if letter in string.ascii_uppercase:
 155             alphabet_start = ord('A')
 156         else:
 157             alphabet_start = ord('a')
 158         letter_number = ord(letter) - alphabet_start
 159         if one_based:
 160             letter_number += 1
 161         raw_cipher_number = (letter_number * multiplier + adder)
 162         cipher_number = 0
 163         if one_based:
 164             cipher_number = (raw_cipher_number - 1) % 26
 165         else:
 166             cipher_number = raw_cipher_number % 26
 167         return chr(cipher_number + alphabet_start)
 168     else:
 169         return letter
 170
 171 def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
 172     """Encipher a letter, given a multiplier and adder
 173
 174     >>> ''.join([affine_decipher_letter(l, 3, 5, True) for l in 'HKNQTWZCFILORUXADGJMPSVYBE'])
 175     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 176     >>> ''.join([affine_decipher_letter(l, 3, 5, False) for l in 'FILORUXADGJMPSVYBEHKNQTWZC'])
 177     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 178     """
 179     if letter in string.ascii_letters:
 180         if letter in string.ascii_uppercase:
 181             alphabet_start = ord('A')
 182         else:
 183             alphabet_start = ord('a')
 184         cipher_number = ord(letter) - alphabet_start
 185         if one_based:
 186             cipher_number += 1
 187         plaintext_number = 0
 188         if one_based:
 189             plaintext_number = (modular_division_table_one_based[multiplier][(cipher_number - adder + 26) % 26] - 1) % 26
 190         else:
 191             #plaintext_number = (modular_division_table[multiplier][cipher_number] - adder) % 26
 192             plaintext_number = modular_division_table[multiplier][(cipher_number - adder + 26) % 26]
 193         return chr(plaintext_number + alphabet_start)
 194     else:
 195         return letter
 196
 197 def affine_encipher(message, multiplier=1, adder=0, one_based=True):
 198     """Encipher a message
 199
 200     >>> affine_encipher('hours passed during which jerico tried every trick he could think of', 15, 22, True)
 201     'lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh'
 202     """
 203     enciphered = [affine_encipher_letter(l, multiplier, adder, one_based) for l in message]
 204     return ''.join(enciphered)
 205
 206 def affine_decipher(message, multiplier=1, adder=0, one_based=True):
 207     """Decipher a message
 208
 209     >>> affine_decipher('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh', 15, 22, True)
 210     'hours passed during which jerico tried every trick he could think of'
 211     """
 212     enciphered = [affine_decipher_letter(l, multiplier, adder, one_based) for l in message]
 213     return ''.join(enciphered)
 214
 215
 216 def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
 217     """Find the cipher alphabet given a keyword.
 218     wrap_alphabet controls how the rest of the alphabet is added
 219     after the keyword.
 220     0 : from 'a'
 221     1 : from the last letter in the sanitised keyword
 222     2 : from the largest letter in the sanitised keyword
 223
 224     >>> keyword_cipher_alphabet_of('bayes')
 225     'bayescdfghijklmnopqrtuvwxz'
 226     >>> keyword_cipher_alphabet_of('bayes', 0)
 227     'bayescdfghijklmnopqrtuvwxz'
 228     >>> keyword_cipher_alphabet_of('bayes', 1)
 229     'bayestuvwxzcdfghijklmnopqr'
 230     >>> keyword_cipher_alphabet_of('bayes', 2)
 231     'bayeszcdfghijklmnopqrtuvwx'
 232     """
 233     if wrap_alphabet == 0:
 234         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + string.ascii_lowercase))
 235     else:
 236         if wrap_alphabet == 1:
 237             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
 238         else:
 239             last_keyword_letter = sorted(sanitise(keyword))[-1]
 240         last_keyword_position = string.ascii_lowercase.find(last_keyword_letter) + 1
 241         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
 242                                               string.ascii_lowercase[last_keyword_position:] +
 243                                               string.ascii_lowercase))
 244     return cipher_alphabet
 245
 246
 247 def keyword_encipher(message, keyword, wrap_alphabet=0):
 248     """Enciphers a message with a keyword substitution cipher.
 249     wrap_alphabet controls how the rest of the alphabet is added
 250     after the keyword.
 251     0 : from 'a'
 252     1 : from the last letter in the sanitised keyword
 253     2 : from the largest letter in the sanitised keyword
 254
 255     >>> keyword_encipher('test message', 'bayes')
 256     'rsqr ksqqbds'
 257     >>> keyword_encipher('test message', 'bayes', 0)
 258     'rsqr ksqqbds'
 259     >>> keyword_encipher('test message', 'bayes', 1)
 260     'lskl dskkbus'
 261     >>> keyword_encipher('test message', 'bayes', 2)
 262     'qspq jsppbcs'
 263     """
 264     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 265     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
 266     return message.lower().translate(cipher_translation)
 267
 268 def keyword_decipher(message, keyword, wrap_alphabet=0):
 269     """Deciphers a message with a keyword substitution cipher.
 270     wrap_alphabet controls how the rest of the alphabet is added
 271     after the keyword.
 272     0 : from 'a'
 273     1 : from the last letter in the sanitised keyword
 274     2 : from the largest letter in the sanitised keyword
 275
 276     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
 277     'test message'
 278     >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
 279     'test message'
 280     >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
 281     'test message'
 282     >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
 283     'test message'
 284     """
 285     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
 286     cipher_translation = ''.maketrans(cipher_alphabet, string.ascii_lowercase)
 287     return message.lower().translate(cipher_translation)
 288
 289
 290 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 291     """Breaks a Caesar cipher using frequency analysis
 292
 293     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
 294     (4, 0.31863952890183...)
 295     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
 296     (19, 0.42152901235832...)
 297     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
 298     (13, 0.316029208075451...)
 299     """
 300     sanitised_message = sanitise(message)
 301     best_shift = 0
 302     best_fit = float("inf")
 303     for shift in range(26):
 304         plaintext = caesar_decipher(sanitised_message, shift)
 305         frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 306         fit = metric(target_frequencies, frequencies)
 307         logger.debug('Caesar break attempt using key {0} gives fit of {1} and decrypt starting: {2}'.format(shift, fit, plaintext[:50]))
 308         if fit < best_fit:
 309             best_fit = fit
 310             best_shift = shift
 311     logger.info('Caesar break best fit: key {0} gives fit of {1} and decrypt starting: {2}'.format(best_shift, best_fit, caesar_decipher(sanitised_message, best_shift)[:50]))
 312     return best_shift, best_fit
 313
 314 def affine_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 315     """Breaks an affine cipher using frequency analysis
 316
 317     >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd clm ckuxj.') # doctest: +ELLIPSIS
 318     ((15, 22, True), 0.23570361818655...)
 319     """
 320     sanitised_message = sanitise(message)
 321     best_multiplier = 0
 322     best_adder = 0
 323     best_one_based = True
 324     best_fit = float("inf")
 325     for one_based in [True, False]:
 326         for multiplier in range(1, 26, 2):
 327             for adder in range(26):
 328                 plaintext = affine_decipher(sanitised_message, multiplier, adder, one_based)
 329                 frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 330                 fit = metric(target_frequencies, frequencies)
 331                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(multiplier, adder, one_based, fit, plaintext[:50]))
 332                 if fit < best_fit:
 333                     best_fit = fit
 334                     best_multiplier = multiplier
 335                     best_adder = adder
 336                     best_one_based = one_based
 337     logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} and decrypt starting: {4}'.format(best_multiplier, best_adder, best_one_based, best_fit, affine_decipher(sanitised_message, best_multiplier, best_adder, best_one_based)[:50]))
 338     return (best_multiplier, best_adder, best_one_based), best_fit
 339
 340
 341 def keyword_break(message, wordlist=keywords, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
 342     """Breaks a keyword substitution cipher using a dictionary and frequency analysis
 343
 344     >>> keyword_break(keyword_encipher('this is a test message for the keyword decipherment', 'elephant', 1), wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
 345     (('elephant', 1), 0.41643991598441...)
 346     """
 347     best_keyword = ''
 348     best_wrap_alphabet = True
 349     best_fit = float("inf")
 350     for wrap_alphabet in range(3):
 351         for keyword in wordlist:
 352             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
 353             frequencies = message_frequency_scaling(letter_frequencies(plaintext))
 354             fit = metric(target_frequencies, frequencies)
 355             logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(keyword, wrap_alphabet, fit, sanitise(plaintext)[:50]))
 356             if fit < best_fit:
 357                 best_fit = fit
 358                 best_keyword = keyword
 359                 best_wrap_alphabet = wrap_alphabet
 360     logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of {2} and decrypt starting: {3}'.format(best_keyword, best_wrap_alphabet, best_fit, sanitise(keyword_decipher(message, best_keyword))[:50]))
 361     return (best_keyword, best_wrap_alphabet), best_fit
 362
 363
 364 if __name__ == "__main__":
 365     import doctest
 366     doctest.testmod()