X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=cipher.py;h=ba4a73f7f39ab8d712b9ba9e52d49e269e1999ed;hb=2a497efd4b256df235ff52606d510d987f8ae325;hp=fdff17fc4e7c0c811253ef295c02d9791e7ec157;hpb=62bbe4277e9676b9255ef98a33ba2ad3dbc0c7ed;p=cipher-tools.git diff --git a/cipher.py b/cipher.py index fdff17f..ba4a73f 100644 --- a/cipher.py +++ b/cipher.py @@ -3,7 +3,7 @@ import collections import norms import logging import math -from itertools import zip_longest +from itertools import zip_longest, repeat from segment import segment from multiprocessing import Pool @@ -34,6 +34,14 @@ with open('count_2l.txt', 'r') as f: english_bigram_counts[bigram] = int(count) normalised_english_bigram_counts = norms.normalise(english_bigram_counts) +english_trigram_counts = collections.defaultdict(int) +with open('count_3l.txt', 'r') as f: + for line in f: + (trigram, count) = line.split("\t") + english_trigram_counts[trigram] = int(count) +normalised_english_trigram_counts = norms.normalise(english_trigram_counts) + + with open('words.txt', 'r') as f: keywords = [line.rstrip() for line in f] @@ -76,20 +84,22 @@ def ngrams(text, n): """ return [text[i:i+n] for i in range(len(text)-n+1)] -def every_nth(text, n): +def every_nth(text, n, fillvalue=''): """Returns n strings, each of which consists of every nth character, starting with the 0th, 1st, 2nd, ... (n-1)th character >>> every_nth(string.ascii_lowercase, 5) ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty'] - >>> every_nth(string.ascii_lowercase, 1) + >>> every_nth(string.ascii_lowercase, 1) ['abcdefghijklmnopqrstuvwxyz'] >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] + >>> every_nth(string.ascii_lowercase, 5, fillvalue='!') + ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!'] """ split_text = [text[i:i+n] for i in range(0, len(text), n)] - return [''.join(l) for l in zip_longest(*split_text, fillvalue='')] + return [''.join(l) for l in zip_longest(*split_text, fillvalue=fillvalue)] def combine_every_nth(split_text): """Reforms a text split into every_nth strings @@ -104,6 +114,36 @@ def combine_every_nth(split_text): return ''.join([''.join(l) for l in zip_longest(*split_text, fillvalue='')]) +def transpose(items, transposition): + """Moves items around according to the given transposition + + >>> transpose(['a', 'b', 'c', 'd'], [0,1,2,3]) + ['a', 'b', 'c', 'd'] + >>> transpose(['a', 'b', 'c', 'd'], [3,1,2,0]) + ['d', 'b', 'c', 'a'] + >>> transpose([10,11,12,13,14,15], [3,2,4,1,5,0]) + [13, 12, 14, 11, 15, 10] + """ + transposed = list(repeat('', len(transposition))) + for p, t in enumerate(transposition): + transposed[p] = items[t] + return transposed + +def untranspose(items, transposition): + """Undoes a transpose + + >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3]) + ['a', 'b', 'c', 'd'] + >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0]) + ['a', 'b', 'c', 'd'] + >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0]) + [10, 11, 12, 13, 14, 15] + """ + transposed = list(repeat('', len(transposition))) + for p, t in enumerate(transposition): + transposed[t] = items[p] + return transposed + def frequencies(text): """Count the number of occurrences of each character in text @@ -399,6 +439,65 @@ def scytale_decipher(message, rows): return ''.join([''.join(c) for c in zip_longest(*columns, fillvalue='')]) +def transpositions_of(keyword): + """Finds the transpostions given by a keyword. For instance, the keyword + 'clever' rearranges to 'celrv', so the first column (0) stays first, the + second column (1) moves to third, the third column (2) moves to second, + and so on. + + >>> transpositions_of('clever') + [0, 2, 1, 4, 3] + """ + key = deduplicate(keyword) + transpositions = [key.index(l) for l in sorted(key)] + return transpositions + +def column_transposition_encipher(message, keyword, fillvalue=' '): + """Enciphers using the column transposition cipher. + Message is padded to allow all rows to be the same length. + + >>> column_transposition_encipher('hellothere', 'clever') + 'hleolteher' + >>> column_transposition_encipher('hellothere', 'cleverly', fillvalue='!') + 'hleolthre!e!' + """ + return column_transposition_worker(message, keyword, encipher=True, + fillvalue=fillvalue) + +def column_transposition_decipher(message, keyword, fillvalue=' '): + """Deciphers using the column transposition cipher. + Message is padded to allow all rows to be the same length. + + >>> column_transposition_decipher('hleolteher', 'clever') + 'hellothere' + >>> column_transposition_decipher('hleolthre!e!', 'cleverly', fillvalue='?') + 'hellothere!!' + """ + return column_transposition_worker(message, keyword, encipher=False, + fillvalue=fillvalue) + +def column_transposition_worker(message, keyword, + encipher=True, fillvalue=' '): + """Does the actual work of the column transposition cipher. + Message is padded with spaces to allow all rows to be the same length. + + >>> column_transposition_worker('hellothere', 'clever') + 'hleolteher' + >>> column_transposition_worker('hellothere', 'clever', encipher=True) + 'hleolteher' + >>> column_transposition_worker('hleolteher', 'clever', encipher=False) + 'hellothere' + """ + transpositions = transpositions_of(keyword) + columns = every_nth(message, len(transpositions), fillvalue=fillvalue) + if encipher: + transposed_columns = transpose(columns, transpositions) + else: + transposed_columns = untranspose(columns, transpositions) + return combine_every_nth(transposed_columns) + + + def caesar_break(message, metric=norms.euclidean_distance, target_counts=normalised_english_counts, @@ -573,6 +672,98 @@ def scytale_break(message, sanitise(scytale_decipher(message, best_key))[:50])) return best_key, best_fit +def column_transposition_break(message, + wordlist=keywords, + metric=norms.euclidean_distance, + target_counts=normalised_english_bigram_counts, + message_frequency_scaling=norms.normalise): + """Breaks a column transposition cipher using a dictionary and + n-gram frequency analysis + + >>> column_transposition_break(column_transposition_encipher(sanitise( \ + "Turing's homosexuality resulted in a criminal prosecution in 1952, \ + when homosexual acts were still illegal in the United Kingdom. "), \ + 'encipher'), \ + wordlist=['encipher', 'keyword', 'fourteen']) # doctest: +ELLIPSIS + ('encipher', 0.898128626285...) + >>> column_transposition_break(column_transposition_encipher(sanitise( \ + "Turing's homosexuality resulted in a criminal prosecution in 1952, " \ + "when homosexual acts were still illegal in the United Kingdom."), \ + 'encipher'), \ + wordlist=['encipher', 'keyword', 'fourteen'], \ + target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS + ('encipher', 1.1958792913127...) + """ + best_keyword = '' + best_fit = float("inf") + ngram_length = len(next(iter(target_counts.keys()))) + for keyword in wordlist: + if len(message) % len(deduplicate(keyword)) == 0: + plaintext = column_transposition_decipher(message, keyword) + counts = message_frequency_scaling(frequencies( + ngrams(sanitise(plaintext), ngram_length))) + fit = metric(target_counts, counts) + logger.debug('Column transposition break attempt using key {0} ' + 'gives fit of {1} and decrypt starting: {2}'.format( + keyword, fit, + sanitise(plaintext)[:50])) + if fit < best_fit: + best_fit = fit + best_keyword = keyword + logger.info('Column transposition break best fit with key {0} gives fit ' + 'of {1} and decrypt starting: {2}'.format(best_keyword, + best_fit, sanitise( + column_transposition_decipher(message, + best_keyword))[:50])) + return best_keyword, best_fit + + +def column_transposition_break_mp(message, + wordlist=keywords, + metric=norms.euclidean_distance, + target_counts=normalised_english_bigram_counts, + message_frequency_scaling=norms.normalise, + chunksize=500): + """Breaks a column transposition cipher using a dictionary and + n-gram frequency analysis + + >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \ + "Turing's homosexuality resulted in a criminal prosecution in 1952, \ + when homosexual acts were still illegal in the United Kingdom. "), \ + 'encipher'), \ + wordlist=['encipher', 'keyword', 'fourteen']) # doctest: +ELLIPSIS + ('encipher', 0.898128626285...) + >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \ + "Turing's homosexuality resulted in a criminal prosecution in 1952, " \ + "when homosexual acts were still illegal in the United Kingdom."), \ + 'encipher'), \ + wordlist=['encipher', 'keyword', 'fourteen'], \ + target_counts=normalised_english_trigram_counts) # doctest: +ELLIPSIS + ('encipher', 1.1958792913127...) + """ + ngram_length = len(next(iter(target_counts.keys()))) + with Pool() as pool: + helper_args = [(message, word, metric, target_counts, ngram_length, + message_frequency_scaling) + for word in wordlist] + # Gotcha: the helper function here needs to be defined at the top level + # (limitation of Pool.starmap) + breaks = pool.starmap(column_transposition_break_worker, helper_args, chunksize) + return min(breaks, key=lambda k: k[1]) + +def column_transposition_break_worker(message, keyword, metric, target_counts, + ngram_length, message_frequency_scaling): + plaintext = column_transposition_decipher(message, keyword) + counts = message_frequency_scaling(frequencies( + ngrams(sanitise(plaintext), ngram_length))) + fit = metric(target_counts, counts) + logger.debug('Column transposition break attempt using key {0} ' + 'gives fit of {1} and decrypt starting: {2}'.format( + keyword, fit, + sanitise(plaintext)[:50])) + return keyword, fit + + if __name__ == "__main__": import doctest