X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=cipher.py;h=d2619029de591af1d07e5f85160e187629332751;hb=c03fdcbe5891f43ce328f33121105a71c01eef86;hp=4fa44cf65a2c31fbef399ffa4b3f3a57bc041064;hpb=a573fbbbdcdc110be7b35cbe78a8a45af99da461;p=cipher-tools.git diff --git a/cipher.py b/cipher.py index 4fa44cf..d261902 100644 --- a/cipher.py +++ b/cipher.py @@ -1,18 +1,8 @@ import string import collections -import norms import logging -import math -from itertools import zip_longest, repeat -from segment import segment -from multiprocessing import Pool - -# To time a run: -# -# import timeit -# c5a = open('2012/5a.ciphertext', 'r').read() -# timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1) -# timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1 +from itertools import zip_longest, cycle, chain +from language_models import * logger = logging.getLogger(__name__) logger.addHandler(logging.FileHandler('cipher.log')) @@ -20,41 +10,14 @@ logger.setLevel(logging.WARNING) #logger.setLevel(logging.INFO) #logger.setLevel(logging.DEBUG) -english_counts = collections.defaultdict(int) -with open('count_1l.txt', 'r') as f: - for line in f: - (letter, count) = line.split("\t") - english_counts[letter] = int(count) -normalised_english_counts = norms.normalise(english_counts) -english_bigram_counts = collections.defaultdict(int) -with open('count_2l.txt', 'r') as f: - for line in f: - (bigram, count) = line.split("\t") - english_bigram_counts[bigram] = int(count) -normalised_english_bigram_counts = norms.normalise(english_bigram_counts) - -with open('words.txt', 'r') as f: - keywords = [line.rstrip() for line in f] - -modular_division_table = [[0]*26 for x in range(26)] +modular_division_table = [[0]*26 for _ in range(26)] for a in range(26): for b in range(26): c = (a * b) % 26 modular_division_table[b][c] = a -def sanitise(text): - """Remove all non-alphabetic characters and convert the text to lowercase - - >>> sanitise('The Quick') - 'thequick' - >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG') - 'thequickbrownfoxjumpedoverthelazydog' - """ - sanitised = [c.lower() for c in text if c in string.ascii_letters] - return ''.join(sanitised) - def ngrams(text, n): """Returns all n-grams of a text @@ -97,17 +60,33 @@ def combine_every_nth(split_text): return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')]) +def chunks(text, n, fillvalue=None): + """Split a text into chunks of n characters + + >>> chunks('abcdefghi', 3) + ['abc', 'def', 'ghi'] + >>> chunks('abcdefghi', 4) + ['abcd', 'efgh', 'i'] + >>> chunks('abcdefghi', 4, fillvalue='!') + ['abcd', 'efgh', 'i!!!'] + """ + if fillvalue: + padding = fillvalue[0] * (n - len(text) % n) + else: + padding = '' + return [(text+padding)[i:i+n] for i in range(0, len(text), n)] + def transpose(items, transposition): """Moves items around according to the given transposition - >>> transpose(['a', 'b', 'c', 'd'], [0,1,2,3]) + >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3)) ['a', 'b', 'c', 'd'] - >>> transpose(['a', 'b', 'c', 'd'], [3,1,2,0]) + >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0)) ['d', 'b', 'c', 'a'] - >>> transpose([10,11,12,13,14,15], [3,2,4,1,5,0]) + >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0)) [13, 12, 14, 11, 15, 10] """ - transposed = list(repeat('', len(transposition))) + transposed = [''] * len(transposition) for p, t in enumerate(transposition): transposed[p] = items[t] return transposed @@ -122,48 +101,15 @@ def untranspose(items, transposition): >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0]) [10, 11, 12, 13, 14, 15] """ - transposed = list(repeat('', len(transposition))) + transposed = [''] * len(transposition) for p, t in enumerate(transposition): transposed[t] = items[p] return transposed - -def frequencies(text): - """Count the number of occurrences of each character in text - - >>> sorted(frequencies('abcdefabc').items()) - [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)] - >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \ - 'dog').items()) # doctest: +NORMALIZE_WHITESPACE - [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), - ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), - ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), - ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)] - >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \ - '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE - [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1), - ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1), - ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2), - ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1), - ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)] - >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \ - 'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE - [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1), - ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), - ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), - ('w', 1), ('x', 1), ('y', 1), ('z', 1)] - """ - counts = collections.defaultdict(int) - for c in text: - counts[c] += 1 - return counts -letter_frequencies = frequencies - def deduplicate(text): return list(collections.OrderedDict.fromkeys(text)) - def caesar_encipher_letter(letter, shift): """Encipher a letter, given a shift amount @@ -271,8 +217,9 @@ def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True): alphabet_start = ord('a') cipher_number = ord(letter) - alphabet_start if one_based: cipher_number += 1 - plaintext_number = ( modular_division_table[multiplier] - [(cipher_number - adder) % 26] ) + plaintext_number = ( + modular_division_table[multiplier] + [(cipher_number - adder) % 26] ) if one_based: plaintext_number -= 1 return chr(plaintext_number % 26 + alphabet_start) else: @@ -420,220 +367,152 @@ def scytale_decipher(message, rows): def transpositions_of(keyword): - """ - >>> transpositions_of('clever') - [0, 2, 1, 4, 3] - """ - key = deduplicate(keyword) - transpositions = [key.index(l) for l in sorted(key)] - return transpositions + """Finds the transpostions given by a keyword. For instance, the keyword + 'clever' rearranges to 'celrv', so the first column (0) stays first, the + second column (1) moves to third, the third column (2) moves to second, + and so on. -def column_transposition_encipher(message, keyword): - """ - >>> column_transposition_encipher('hellothere', 'clever') - 'hleolteher' - """ - return column_transposition_worker(message, keyword, encipher=True) + If passed a tuple, assume it's already a transposition and just return it. -def column_transposition_decipher(message, keyword): - """ - >>> column_transposition_decipher('hleolteher', 'clever') + >>> transpositions_of('clever') + (0, 2, 1, 4, 3) + >>> transpositions_of('fred') + (3, 2, 0, 1) + >>> transpositions_of((3, 2, 0, 1)) + (3, 2, 0, 1) + """ + if isinstance(keyword, tuple): + return keyword + else: + key = deduplicate(keyword) + transpositions = tuple(key.index(l) for l in sorted(key)) + return transpositions + +def pad(message_len, group_len, fillvalue): + padding_length = group_len - message_len % group_len + if padding_length == group_len: padding_length = 0 + padding = '' + for i in range(padding_length): + if callable(fillvalue): + padding += fillvalue() + else: + padding += fillvalue + return padding + +def column_transposition_encipher(message, keyword, fillvalue=' ', + fillcolumnwise=False, + emptycolumnwise=False): + """Enciphers using the column transposition cipher. + Message is padded to allow all rows to be the same length. + + >>> column_transposition_encipher('hellothere', 'abcdef', fillcolumnwise=True) + 'hlohr eltee ' + >>> column_transposition_encipher('hellothere', 'abcdef', fillcolumnwise=True, emptycolumnwise=True) + 'hellothere ' + >>> column_transposition_encipher('hellothere', 'abcdef') + 'hellothere ' + >>> column_transposition_encipher('hellothere', 'abcde') + 'hellothere' + >>> column_transposition_encipher('hellothere', 'abcde', fillcolumnwise=True, emptycolumnwise=True) + 'hellothere' + >>> column_transposition_encipher('hellothere', 'abcde', fillcolumnwise=True, emptycolumnwise=False) + 'hlohreltee' + >>> column_transposition_encipher('hellothere', 'abcde', fillcolumnwise=False, emptycolumnwise=True) + 'htehlelroe' + >>> column_transposition_encipher('hellothere', 'abcde', fillcolumnwise=False, emptycolumnwise=False) 'hellothere' + >>> column_transposition_encipher('hellothere', 'clever', fillcolumnwise=True, emptycolumnwise=True) + 'heotllrehe' + >>> column_transposition_encipher('hellothere', 'clever', fillcolumnwise=True, emptycolumnwise=False) + 'holrhetlee' + >>> column_transposition_encipher('hellothere', 'clever', fillcolumnwise=False, emptycolumnwise=True) + 'htleehoelr' + >>> column_transposition_encipher('hellothere', 'clever', fillcolumnwise=False, emptycolumnwise=False) + 'hleolteher' + >>> column_transposition_encipher('hellothere', 'cleverly') + 'hleolthre e ' + >>> column_transposition_encipher('hellothere', 'cleverly', fillvalue='!') + 'hleolthre!e!' + >>> column_transposition_encipher('hellothere', 'cleverly', fillvalue=lambda: '*') + 'hleolthre*e*' """ - return column_transposition_worker(message, keyword, encipher=False) + transpositions = transpositions_of(keyword) + message += pad(len(message), len(transpositions), fillvalue) + if fillcolumnwise: + rows = every_nth(message, len(message) // len(transpositions)) + else: + rows = chunks(message, len(transpositions)) + transposed = [transpose(r, transpositions) for r in rows] + if emptycolumnwise: + return combine_every_nth(transposed) + else: + return ''.join(chain(*transposed)) -def column_transposition_worker(message, keyword, encipher=True): - """ - >>> column_transposition_worker('hellothere', 'clever') - 'hleolteher' - >>> column_transposition_worker('hellothere', 'clever', encipher=True) - 'hleolteher' - >>> column_transposition_worker('hleolteher', 'clever', encipher=False) +def column_transposition_decipher(message, keyword, fillvalue=' ', + fillcolumnwise=False, + emptycolumnwise=False): + """Deciphers using the column transposition cipher. + Message is padded to allow all rows to be the same length. + + >>> column_transposition_decipher('hellothere', 'abcde', fillcolumnwise=True, emptycolumnwise=True) + 'hellothere' + >>> column_transposition_decipher('hlohreltee', 'abcde', fillcolumnwise=True, emptycolumnwise=False) + 'hellothere' + >>> column_transposition_decipher('htehlelroe', 'abcde', fillcolumnwise=False, emptycolumnwise=True) + 'hellothere' + >>> column_transposition_decipher('hellothere', 'abcde', fillcolumnwise=False, emptycolumnwise=False) + 'hellothere' + >>> column_transposition_decipher('heotllrehe', 'clever', fillcolumnwise=True, emptycolumnwise=True) + 'hellothere' + >>> column_transposition_decipher('holrhetlee', 'clever', fillcolumnwise=True, emptycolumnwise=False) + 'hellothere' + >>> column_transposition_decipher('htleehoelr', 'clever', fillcolumnwise=False, emptycolumnwise=True) + 'hellothere' + >>> column_transposition_decipher('hleolteher', 'clever', fillcolumnwise=False, emptycolumnwise=False) 'hellothere' """ + # >>> column_transposition_decipher('hleolteher', 'clever') + # 'hellothere' + # >>> column_transposition_decipher('hleolthre!e!', 'cleverly', fillvalue='?') + # 'hellothere!!' + # >>> column_transposition_decipher('htleehoelr', 'clever', columnwise=True) + # 'hellothere' + # """ transpositions = transpositions_of(keyword) - columns = every_nth(message, len(transpositions), fillvalue=' ') - if encipher: - transposed_columns = transpose(columns, transpositions) + message += pad(len(message), len(transpositions), '*') + if emptycolumnwise: + rows = every_nth(message, len(message) // len(transpositions)) + else: + rows = chunks(message, len(transpositions)) + untransposed = [untranspose(r, transpositions) for r in rows] + if fillcolumnwise: + return combine_every_nth(untransposed) else: - transposed_columns = untranspose(columns, transpositions) - return combine_every_nth(transposed_columns) + return ''.join(chain(*untransposed)) +def vigenere_encipher(message, keyword): + """Vigenere encipher -def caesar_break(message, - metric=norms.euclidean_distance, - target_counts=normalised_english_counts, - message_frequency_scaling=norms.normalise): - """Breaks a Caesar cipher using frequency analysis - - >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \ - 'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS - (4, 0.31863952890183...) - >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \ - 'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS - (19, 0.42152901235832...) - >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \ - 'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS - (13, 0.316029208075451...) - """ - sanitised_message = sanitise(message) - best_shift = 0 - best_fit = float("inf") - for shift in range(26): - plaintext = caesar_decipher(sanitised_message, shift) - counts = message_frequency_scaling(letter_frequencies(plaintext)) - fit = metric(target_counts, counts) - logger.debug('Caesar break attempt using key {0} gives fit of {1} ' - 'and decrypt starting: {2}'.format(shift, fit, plaintext[:50])) - if fit < best_fit: - best_fit = fit - best_shift = shift - logger.info('Caesar break best fit: key {0} gives fit of {1} and ' - 'decrypt starting: {2}'.format(best_shift, best_fit, - caesar_decipher(sanitised_message, best_shift)[:50])) - return best_shift, best_fit - -def affine_break(message, - metric=norms.euclidean_distance, - target_counts=normalised_english_counts, - message_frequency_scaling=norms.normalise): - """Breaks an affine cipher using frequency analysis - - >>> affine_break('lmyfu bkuusd dyfaxw claol psfaom jfasd snsfg jfaoe ls ' \ - 'omytd jlaxe mh jm bfmibj umis hfsul axubafkjamx. ls kffkxwsd jls ' \ - 'ofgbjmwfkiu olfmxmtmwaokttg jlsx ls kffkxwsd jlsi zg tsxwjl. jlsx ' \ - 'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \ - 'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai kxd ' \ - 'clm ckuxj.') # doctest: +ELLIPSIS - ((15, 22, True), 0.23570361818655...) + >>> vigenere_encipher('hello', 'abc') + 'hfnlp' """ - sanitised_message = sanitise(message) - best_multiplier = 0 - best_adder = 0 - best_one_based = True - best_fit = float("inf") - for one_based in [True, False]: - for multiplier in range(1, 26, 2): - for adder in range(26): - plaintext = affine_decipher(sanitised_message, - multiplier, adder, one_based) - counts = message_frequency_scaling(letter_frequencies(plaintext)) - fit = metric(target_counts, counts) - logger.debug('Affine break attempt using key {0}x+{1} ({2}) ' - 'gives fit of {3} and decrypt starting: {4}'. - format(multiplier, adder, one_based, fit, - plaintext[:50])) - if fit < best_fit: - best_fit = fit - best_multiplier = multiplier - best_adder = adder - best_one_based = one_based - logger.info('Affine break best fit with key {0}x+{1} ({2}) gives fit of {3} ' - 'and decrypt starting: {4}'.format( - best_multiplier, best_adder, best_one_based, best_fit, - affine_decipher(sanitised_message, best_multiplier, - best_adder, best_one_based)[:50])) - return (best_multiplier, best_adder, best_one_based), best_fit - -def keyword_break(message, - wordlist=keywords, - metric=norms.euclidean_distance, - target_counts=normalised_english_counts, - message_frequency_scaling=norms.normalise): - """Breaks a keyword substitution cipher using a dictionary and - frequency analysis - - >>> keyword_break(keyword_encipher('this is a test message for the ' \ - 'keyword decipherment', 'elephant', 1), \ - wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS - (('elephant', 1), 0.41643991598441...) - """ - best_keyword = '' - best_wrap_alphabet = True - best_fit = float("inf") - for wrap_alphabet in range(3): - for keyword in wordlist: - plaintext = keyword_decipher(message, keyword, wrap_alphabet) - counts = message_frequency_scaling(letter_frequencies(plaintext)) - fit = metric(target_counts, counts) - logger.debug('Keyword break attempt using key {0} (wrap={1}) ' - 'gives fit of {2} and decrypt starting: {3}'.format( - keyword, wrap_alphabet, fit, - sanitise(plaintext)[:50])) - if fit < best_fit: - best_fit = fit - best_keyword = keyword - best_wrap_alphabet = wrap_alphabet - logger.info('Keyword break best fit with key {0} (wrap={1}) gives fit of ' - '{2} and decrypt starting: {3}'.format(best_keyword, - best_wrap_alphabet, best_fit, sanitise( - keyword_decipher(message, best_keyword, - best_wrap_alphabet))[:50])) - return (best_keyword, best_wrap_alphabet), best_fit - -def keyword_break_mp(message, - wordlist=keywords, - metric=norms.euclidean_distance, - target_counts=normalised_english_counts, - message_frequency_scaling=norms.normalise, - chunksize=500): - """Breaks a keyword substitution cipher using a dictionary and - frequency analysis - - >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \ - 'keyword decipherment', 'elephant', 1), \ - wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS - (('elephant', 1), 0.41643991598441...) - """ - with Pool() as pool: - helper_args = [(message, word, wrap, metric, target_counts, - message_frequency_scaling) - for word in wordlist for wrap in range(3)] - # Gotcha: the helper function here needs to be defined at the top level - # (limitation of Pool.starmap) - breaks = pool.starmap(keyword_break_one, helper_args, chunksize) - return min(breaks, key=lambda k: k[1]) - -def keyword_break_one(message, keyword, wrap_alphabet, metric, target_counts, - message_frequency_scaling): - plaintext = keyword_decipher(message, keyword, wrap_alphabet) - counts = message_frequency_scaling(letter_frequencies(plaintext)) - fit = metric(target_counts, counts) - logger.debug('Keyword break attempt using key {0} (wrap={1}) gives fit of ' - '{2} and decrypt starting: {3}'.format(keyword, - wrap_alphabet, fit, sanitise(plaintext)[:50])) - return (keyword, wrap_alphabet), fit - -def scytale_break(message, - metric=norms.euclidean_distance, - target_counts=normalised_english_bigram_counts, - message_frequency_scaling=norms.normalise): - """Breaks a Scytale cipher - - >>> scytale_break('tfeulchtrtteehwahsdehneoifeayfsondmwpltmaoalhikotoere' \ - 'dcweatehiplwxsnhooacgorrcrcraotohsgullasenylrendaianeplscdriioto' \ - 'aek') # doctest: +ELLIPSIS - (6, 0.83453041115025...) + shifts = [ord(l) - ord('a') for l in sanitise(keyword)] + pairs = zip(message, cycle(shifts)) + return ''.join([caesar_encipher_letter(l, k) for l, k in pairs]) + +def vigenere_decipher(message, keyword): + """Vigenere decipher + + >>> vigenere_decipher('hfnlp', 'abc') + 'hello' """ - best_key = 0 - best_fit = float("inf") - for key in range(1, 20): - if len(message) % key == 0: - plaintext = scytale_decipher(message, key) - counts = message_frequency_scaling(frequencies( - ngrams(sanitise(plaintext), 2))) - fit = metric(target_counts, counts) - logger.debug('Scytale break attempt using key {0} gives fit of ' - '{1} and decrypt starting: {2}'.format(key, - fit, sanitise(plaintext)[:50])) - if fit < best_fit: - best_fit = fit - best_key = key - logger.info('Scytale break best fit with key {0} gives fit of {1} and ' - 'decrypt starting: {2}'.format(best_key, best_fit, - sanitise(scytale_decipher(message, best_key))[:50])) - return best_key, best_fit + shifts = [ord(l) - ord('a') for l in sanitise(keyword)] + pairs = zip(message, cycle(shifts)) + return ''.join([caesar_decipher_letter(l, k) for l, k in pairs]) + +beaufort_encipher=vigenere_decipher +beaufort_decipher=vigenere_encipher if __name__ == "__main__":