From 5022df4c849824315bf0ee432b223f6cdc07712d Mon Sep 17 00:00:00 2001
From: Neil Smith
Date: Mon, 14 Jul 2014 21:01:42 +0100
Subject: [PATCH] Caesar ciphers

---
 cipher.py          |   7 +--
 cipherbreak.py     | 103 ---------------------------------------------
 count_1l.txt       |  26 ------------
 language_models.py |  38 -----------------
 lettercount.py     |  13 ------
 5 files changed, 4 insertions(+), 183 deletions(-)
 delete mode 100644 cipherbreak.py
 delete mode 100644 count_1l.txt
 delete mode 100644 lettercount.py

diff --git a/cipher.py b/cipher.py
index e71902d..c5b29f0 100644
--- a/cipher.py
+++ b/cipher.py
@@ -3,9 +3,10 @@ them. See cipherbreak for automatic breaking of these ciphers
 """
 
 import string
-import collections
-from enum import Enum
-from itertools import zip_longest, cycle, chain
+# import collections
+# from enum import Enum
+# from itertools import zip_longest, cycle, chain
+
 from language_models import unaccent, sanitise
 
diff --git a/cipherbreak.py b/cipherbreak.py
deleted file mode 100644
index ddcb423..0000000
--- a/cipherbreak.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""A set of functions to break the ciphers given in cipher.py.
-"""
-
-import string
-import collections
-import norms
-import logging
-import math
-
-import matplotlib.pyplot as plt
-
-logger = logging.getLogger(__name__)
-logger.addHandler(logging.FileHandler('cipher.log'))
-logger.setLevel(logging.WARNING)
-#logger.setLevel(logging.INFO)
-#logger.setLevel(logging.DEBUG)
-
-from cipher import *
-from language_models import *
-
-# To time a run:
-#
-# import timeit
-# c5a = open('2012/5a.ciphertext', 'r').read()
-# timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
-# timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
-
-
-def frequencies(text):
-    """Count the number of occurrences of each character in text
-
-    >>> sorted(frequencies('abcdefabc').items())
-    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
-    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
-            'dog').items()) # doctest: +NORMALIZE_WHITESPACE
-    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
-     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
-     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
-     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
-            '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
-    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
-     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
-     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
-     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
-     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
-            'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
-    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
-     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
-     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
-     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> frequencies('abcdefabcdef')['x']
-    0
-    """
-    return collections.Counter(c for c in text)
-
-
-def caesar_break(message, fitness=Pletters):
-    """Breaks a Caesar cipher using frequency analysis
-
-    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
-            'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
-    (4, -130.849989015...)
-    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
-            'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
-    (19, -128.82410410...)
-    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
-            'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
-    (13, -126.25403935...)
-    """
-    sanitised_message = sanitise(message)
-    best_shift = 0
-    best_fit = float('-inf')
-    for shift in range(26):
-        plaintext = caesar_decipher(sanitised_message, shift)
-        fit = fitness(plaintext)
-        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
-                     'and decrypt starting: {2}'.format(shift, fit,
-                         plaintext[:50]))
-        if fit > best_fit:
-            best_fit = fit
-            best_shift = shift
-    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
-                'decrypt starting: {2}'.format(best_shift, best_fit,
-                    caesar_decipher(sanitised_message, best_shift)[:50]))
-    return best_shift, best_fit
-
-
-def plot_frequency_histogram(freqs, sort_key=None):
-    x = range(len(freqs.keys()))
-    y = [freqs[l] for l in sorted(freqs.keys(), key=sort_key)]
-    f = plt.figure()
-    ax = f.add_axes([0.1, 0.1, 0.9, 0.9])
-    ax.bar(x, y, align='center')
-    ax.set_xticks(x)
-    ax.set_xticklabels(sorted(freqs.keys(), key=sort_key))
-    f.show()
-
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
diff --git a/count_1l.txt b/count_1l.txt
deleted file mode 100644
index e9ac0c6..0000000
--- a/count_1l.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-e	758103
-t	560576
-o	504520
-a	490129
-i	421240
-n	419374
-h	416369
-s	404473
-r	373599
-d	267917
-l	259023
-u	190269
-m	172199
-w	154157
-y	143040
-c	141094
-f	135318
-g	117888
-p	100690
-b	92919
-v	65297
-k	54248
-x	7414
-j	6679
-q	5499
-z	3577
diff --git a/language_models.py b/language_models.py
index 4798d73..0ee0394 100644
--- a/language_models.py
+++ b/language_models.py
@@ -3,12 +3,7 @@ its use.
 """
 
 import string
-import random
-import norms
-import collections
 import unicodedata
-import itertools
-from math import log10
 
 
 def letters(text):
     """Remove all non-alphabetic characters from a text
@@ -54,39 +49,6 @@ def sanitise(text):
     return letters(unaccent(text)).lower()
 
 
-def datafile(name, sep='\t'):
-    """Read key,value pairs from file.
-    """
-    with open(name, 'r') as f:
-        for line in f:
-            splits = line.split(sep)
-            yield [splits[0], int(splits[1])]
-
-english_counts = collections.Counter(dict(datafile('count_1l.txt')))
-normalised_english_counts = norms.normalise(english_counts)
-
-# The log 10 probabilities of letters
-Pl = {l: log10(n) for l, n in normalised_english_counts.items()}
-
-with open('words.txt', 'r') as f:
-    keywords = [line.rstrip() for line in f]
-
-def Pletters(letters):
-    """The Naive Bayes log probability of a sequence of letters.
- """ - return sum(Pl[l.lower()] for l in letters) - - -def cosine_similarity_score(text): - """Finds the dissimilarity of a text to English, using the cosine distance - of the frequency distribution. - - >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS - 0.26228882... - """ - return norms.cosine_similarity(english_counts, - collections.Counter(sanitise(text))) - if __name__ == "__main__": import doctest diff --git a/lettercount.py b/lettercount.py deleted file mode 100644 index 956eca1..0000000 --- a/lettercount.py +++ /dev/null @@ -1,13 +0,0 @@ -from language_models import sanitise -import collections - -corpora = ['shakespeare.txt', 'sherlock-holmes.txt', 'war-and-peace.txt'] -counts = collections.Counter() - -for corpus in corpora: - text = sanitise(open(corpus).read()) - counts.update(text) - -with open('count_1l.txt', 'w') as f: - for l, c in counts.most_common(): - f.write("{}\t{}\n".format(l, c)) -- 2.34.1