From: Neil Smith
Date: Sun, 6 Oct 2013 17:09:49 +0000 (+0100)
Subject: Added experiment for checking which metrics work best for using unigram frequency...
X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=26d9d2228e47a6ff8b8696d37c0a8d6d6b906c67;p=cipher-tools.git

Added experiment for checking which metrics work best for using unigram frequency
analysis for breaking Caesar ciphers
---

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..14ba524
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*~
+/.bundle
+/config/database.yml
+/config/deploy.rb
+*doc
+/log/*log
+/tmp
+
diff --git a/__pycache__/cipher.cpython-33.pyc b/__pycache__/cipher.cpython-33.pyc
new file mode 100644
index 0000000..05222b0
Binary files /dev/null and b/__pycache__/cipher.cpython-33.pyc differ
diff --git a/caesar_break_parameter_trials.csv b/caesar_break_parameter_trials.csv
new file mode 100644
index 0000000..df9b836
--- /dev/null
+++ b/caesar_break_parameter_trials.csv
@@ -0,0 +1,144 @@
+l1, normalised_english_counts, normalise, 3000, 0.9616
+l1, normalised_english_counts, normalise, 1000, 0.9562
+l1, normalised_english_counts, normalise, 300, 0.9598
+l1, normalised_english_counts, normalise, 100, 0.9622
+l1, normalised_english_counts, normalise, 50, 0.9584
+l1, normalised_english_counts, normalise, 30, 0.953
+l1, normalised_english_counts, normalise, 20, 0.917
+l1, normalised_english_counts, normalise, 10, 0.7328
+l1, normalised_english_counts, normalise, 5, 0.4394
+l1, normalised_english_counts, scale, 3000, 0.9618
+l1, normalised_english_counts, scale, 1000, 0.9574
+l1, normalised_english_counts, scale, 300, 0.9624
+l1, normalised_english_counts, scale, 100, 0.9566
+l1, normalised_english_counts, scale, 50, 0.959
+l1, normalised_english_counts, scale, 30, 0.9476
+l1, normalised_english_counts, scale, 20, 0.8968
+l1, normalised_english_counts, scale, 10, 0.6844
+l1, normalised_english_counts, scale, 5, 0.4298
+l1, scaled_english_counts, normalise, 3000, 0.957
+l1, scaled_english_counts, normalise, 1000, 0.9662
+l1, scaled_english_counts, normalise, 300, 0.9604
+l1, scaled_english_counts, normalise, 100, 0.9602
+l1, scaled_english_counts, normalise, 50, 0.9578
+l1, scaled_english_counts, normalise, 30, 0.9504
+l1, scaled_english_counts, normalise, 20, 0.9174
+l1, scaled_english_counts, normalise, 10, 0.7204
+l1, scaled_english_counts, normalise, 5, 0.4506
+l1, scaled_english_counts, scale, 3000, 0.9584
+l1, scaled_english_counts, scale, 1000, 0.9586
+l1, scaled_english_counts, scale, 300, 0.964
+l1, scaled_english_counts, scale, 100, 0.9582
+l1, scaled_english_counts, scale, 50, 0.9606
+l1, scaled_english_counts, scale, 30, 0.944
+l1, scaled_english_counts, scale, 20, 0.915
+l1, scaled_english_counts, scale, 10, 0.7324
+l1, scaled_english_counts, scale, 5, 0.4446
+l2, normalised_english_counts, normalise, 3000, 0.953
+l2, normalised_english_counts, normalise, 1000, 0.962
+l2, normalised_english_counts, normalise, 300, 0.9638
+l2, normalised_english_counts, normalise, 100, 0.9632
+l2, normalised_english_counts, normalise, 50, 0.9604
+l2, normalised_english_counts, normalise, 30, 0.95
+l2, normalised_english_counts, normalise, 20, 0.892
+l2, normalised_english_counts, normalise, 10, 0.7124
+l2, normalised_english_counts, normalise, 5, 0.4406
+l2, normalised_english_counts, scale, 3000, 0.9626
+l2, normalised_english_counts, scale, 1000, 0.956
+l2, normalised_english_counts, scale, 300, 0.962
+l2, normalised_english_counts, scale, 100, 0.9572
+l2, normalised_english_counts, scale, 50, 0.9526
+l2, normalised_english_counts, scale, 30, 0.9478
+l2, normalised_english_counts, scale, 20, 0.9046
+l2, normalised_english_counts, scale, 10, 0.6896
+l2, normalised_english_counts, scale, 5, 0.4308
+l2, scaled_english_counts, normalise, 3000, 0.9574
+l2, scaled_english_counts, normalise, 1000, 0.9568
+l2, scaled_english_counts, normalise, 300, 0.9536
+l2, scaled_english_counts, normalise, 100, 0.9624
+l2, scaled_english_counts, normalise, 50, 0.9606
+l2, scaled_english_counts, normalise, 30, 0.9384
+l2, scaled_english_counts, normalise, 20, 0.8914
+l2, scaled_english_counts, normalise, 10, 0.6892
+l2, scaled_english_counts, normalise, 5, 0.4196
+l2, scaled_english_counts, scale, 3000, 0.9532
+l2, scaled_english_counts, scale, 1000, 0.9588
+l2, scaled_english_counts, scale, 300, 0.9644
+l2, scaled_english_counts, scale, 100, 0.9572
+l2, scaled_english_counts, scale, 50, 0.9586
+l2, scaled_english_counts, scale, 30, 0.9436
+l2, scaled_english_counts, scale, 20, 0.9036
+l2, scaled_english_counts, scale, 10, 0.693
+l2, scaled_english_counts, scale, 5, 0.4376
+l3, normalised_english_counts, normalise, 3000, 0.9626
+l3, normalised_english_counts, normalise, 1000, 0.9582
+l3, normalised_english_counts, normalise, 300, 0.9542
+l3, normalised_english_counts, normalise, 100, 0.9606
+l3, normalised_english_counts, normalise, 50, 0.953
+l3, normalised_english_counts, normalise, 30, 0.9248
+l3, normalised_english_counts, normalise, 20, 0.8684
+l3, normalised_english_counts, normalise, 10, 0.6106
+l3, normalised_english_counts, normalise, 5, 0.4064
+l3, normalised_english_counts, scale, 3000, 0.961
+l3, normalised_english_counts, scale, 1000, 0.9568
+l3, normalised_english_counts, scale, 300, 0.9566
+l3, normalised_english_counts, scale, 100, 0.9554
+l3, normalised_english_counts, scale, 50, 0.9436
+l3, normalised_english_counts, scale, 30, 0.8936
+l3, normalised_english_counts, scale, 20, 0.8016
+l3, normalised_english_counts, scale, 10, 0.579
+l3, normalised_english_counts, scale, 5, 0.4114
+l3, scaled_english_counts, normalise, 3000, 0.9616
+l3, scaled_english_counts, normalise, 1000, 0.9612
+l3, scaled_english_counts, normalise, 300, 0.9624
+l3, scaled_english_counts, normalise, 100, 0.9524
+l3, scaled_english_counts, normalise, 50, 0.9474
+l3, scaled_english_counts, normalise, 30, 0.9066
+l3, scaled_english_counts, normalise, 20, 0.8004
+l3, scaled_english_counts, normalise, 10, 0.5686
+l3, scaled_english_counts, normalise, 5, 0.3404
+l3, scaled_english_counts, scale, 3000, 0.96
+l3, scaled_english_counts, scale, 1000, 0.96
+l3, scaled_english_counts, scale, 300, 0.9596
+l3, scaled_english_counts, scale, 100, 0.96
+l3, scaled_english_counts, scale, 50, 0.954
+l3, scaled_english_counts, scale, 30, 0.9374
+l3, scaled_english_counts, scale, 20, 0.862
+l3, scaled_english_counts, scale, 10, 0.6276
+l3, scaled_english_counts, scale, 5, 0.399
+cosine_distance, normalised_english_counts, normalise, 3000, 0.9618
+cosine_distance, normalised_english_counts, normalise, 1000, 0.96
+cosine_distance, normalised_english_counts, normalise, 300, 0.9604
+cosine_distance, normalised_english_counts, normalise, 100, 0.9538
+cosine_distance, normalised_english_counts, normalise, 50, 0.9608
+cosine_distance, normalised_english_counts, normalise, 30, 0.9426
+cosine_distance, normalised_english_counts, normalise, 20, 0.9012
+cosine_distance, normalised_english_counts, normalise, 10, 0.6916
+cosine_distance, normalised_english_counts, normalise, 5, 0.4286
+cosine_distance, normalised_english_counts, scale, 3000, 0.9606
+cosine_distance, normalised_english_counts, scale, 1000, 0.9572
+cosine_distance, normalised_english_counts, scale, 300, 0.9628
+cosine_distance, normalised_english_counts, scale, 100, 0.959
+cosine_distance, normalised_english_counts, scale, 50, 0.9542
+cosine_distance, normalised_english_counts, scale, 30, 0.951
+cosine_distance, normalised_english_counts, scale, 20, 0.9028
+cosine_distance, normalised_english_counts, scale, 10, 0.7028
+cosine_distance, normalised_english_counts, scale, 5, 0.44
+cosine_distance, scaled_english_counts, normalise, 3000, 0.9582
+cosine_distance, scaled_english_counts, normalise, 1000, 0.9614
+cosine_distance, scaled_english_counts, normalise, 300, 0.9632
+cosine_distance, scaled_english_counts, normalise, 100, 0.9584
+cosine_distance, scaled_english_counts, normalise, 50, 0.9574
+cosine_distance, scaled_english_counts, normalise, 30, 0.9506
+cosine_distance, scaled_english_counts, normalise, 20, 0.8956
+cosine_distance, scaled_english_counts, normalise, 10, 0.6916
+cosine_distance, scaled_english_counts, normalise, 5, 0.4356
+cosine_distance, scaled_english_counts, scale, 3000, 0.9572
+cosine_distance, scaled_english_counts, scale, 1000, 0.961
+cosine_distance, scaled_english_counts, scale, 300, 0.9596
+cosine_distance, scaled_english_counts, scale, 100, 0.9544
+cosine_distance, scaled_english_counts, scale, 50, 0.9598
+cosine_distance, scaled_english_counts, scale, 30, 0.9414
+cosine_distance, scaled_english_counts, scale, 20, 0.9036
+cosine_distance, scaled_english_counts, scale, 10, 0.6928
+cosine_distance, scaled_english_counts, scale, 5, 0.4178
diff --git a/find_best_caesar_break_parameters.py b/find_best_caesar_break_parameters.py
new file mode 100644
index 0000000..711cff0
--- /dev/null
+++ b/find_best_caesar_break_parameters.py
@@ -0,0 +1,64 @@
+import random
+import collections
+import norms
+from cipher import *
+
+
+corpus = sanitise(''.join([open('shakespeare.txt', 'r').read(),
+                           open('sherlock-holmes.txt', 'r').read(),
+                           open('war-and-peace.txt', 'r').read()]))
+corpus_length = len(corpus)
+
+scaled_english_counts = norms.scale(english_counts)
+
+
+metrics = [norms.l1, norms.l2, norms.l3, norms.cosine_distance, norms.harmonic_mean, norms.geometric_mean]
+corpus_frequencies = [normalised_english_counts, scaled_english_counts]
+scalings = [norms.normalise, norms.scale]
+message_lengths = [3000, 1000, 300, 100, 50, 30, 20, 10, 5]
+
+metric_names = ['l1', 'l2', 'l3', 'cosine_distance', 'harmonic_mean', 'geometric_mean']
+corpus_frequency_names = ['normalised_english_counts', 'scaled_english_counts']
+scaling_names = ['normalise', 'scale']
+
+trials = 5000
+
+# scores[metric][frequencies][scaling][message_length] counts how many of the
+# trials recovered the correct key.
+scores = collections.defaultdict(int)
+for metric in range(len(metrics)):
+    scores[metric_names[metric]] = collections.defaultdict(int)
+    for corpus_frequency in range(len(corpus_frequencies)):
+        scores[metric_names[metric]][corpus_frequency_names[corpus_frequency]] = collections.defaultdict(int)
+        for scaling in range(len(scalings)):
+            scores[metric_names[metric]][corpus_frequency_names[corpus_frequency]][scaling_names[scaling]] = collections.defaultdict(int)
+            for message_length in message_lengths:
+                for i in range(trials):
+                    sample_start = random.randint(0, corpus_length - message_length)
+                    sample = corpus[sample_start:(sample_start + message_length)]
+                    key = random.randint(1, 25)
+                    sample_ciphertext = caesar_encipher(sample, key)
+                    (found_key, score) = caesar_break(sample_ciphertext,
+                                                      metric=metrics[metric],
+                                                      target_frequencies=corpus_frequencies[corpus_frequency],
+                                                      message_frequency_scaling=scalings[scaling])
+                    if found_key == key:
+                        scores[metric_names[metric]][corpus_frequency_names[corpus_frequency]][scaling_names[scaling]][message_length] += 1
+                print(', '.join([metric_names[metric],
+                                 corpus_frequency_names[corpus_frequency],
+                                 scaling_names[scaling],
+                                 str(message_length),
+                                 str(scores[metric_names[metric]][corpus_frequency_names[corpus_frequency]][scaling_names[scaling]][message_length] / trials)]))
+
+
+with open('caesar_break_parameter_trials.csv', 'w') as f:
+    for metric in range(len(metrics)):
+        for corpus_frequency in range(len(corpus_frequencies)):
+            for scaling in range(len(scalings)):
+                for message_length in message_lengths:
+                    print(', '.join([metric_names[metric],
+                                     corpus_frequency_names[corpus_frequency],
+                                     scaling_names[scaling],
+                                     str(message_length),
+                                     str(scores[metric_names[metric]][corpus_frequency_names[corpus_frequency]][scaling_names[scaling]][message_length] / trials)]),
+                          file=f)
diff --git a/norms.py b/norms.py
index 744cbe4..4fdf1e3 100644
--- a/norms.py
+++ b/norms.py
@@ -96,6 +96,37 @@ def l3(frequencies1, frequencies2):
         total += abs(frequencies1[k] - frequencies2[k]) ** 3
     return total ** (1/3)
 
+def geometric_mean(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as
+    dictionaries, as the geometric mean of the pointwise differences.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    """
+    # The running product must start at 1, not 0, or the result is always 0.
+    total = 1
+    for k in frequencies1.keys():
+        total *= abs(frequencies1[k] - frequencies2[k])
+    # The n-th root makes this a true geometric mean; it ranks candidate
+    # decipherments identically to the raw product.
+    return total ** (1 / len(frequencies1))
+
+def harmonic_mean(frequencies1, frequencies2):
+    """Finds the distance between two frequency profiles, expressed as
+    dictionaries, as the harmonic mean of the pointwise differences.
+    Assumes every key in frequencies1 is also in frequencies2
+
+    """
+    total = 0
+    for k in frequencies1.keys():
+        difference = abs(frequencies1[k] - frequencies2[k])
+        # A zero difference would divide by zero; treat it as an exact match
+        # on that key and let it dominate, giving a distance of 0.
+        if difference == 0:
+            return 0
+        total += 1 / difference
+    return len(frequencies1) / total
+
+
 def cosine_distance(frequencies1, frequencies2):
     """Finds the distances between two frequency profiles, expressed as dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
diff --git a/norms.pyc b/norms.pyc
new file mode 100644
index 0000000..540caf5
Binary files /dev/null and b/norms.pyc differ
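
For reference, a single trial from the experiment above can be run by hand. This is a
minimal sketch, not part of the commit; it assumes cipher.py exposes sanitise,
caesar_encipher, caesar_break and normalised_english_counts with the signatures used
in find_best_caesar_break_parameters.py:

import random
import norms
from cipher import sanitise, caesar_encipher, caesar_break, normalised_english_counts

# Take a fixed-length sample of plaintext and encipher it with a random key.
sample = sanitise(open('shakespeare.txt', 'r').read())[:100]
key = random.randint(1, 25)
ciphertext = caesar_encipher(sample, key)

# caesar_break returns its best guess at the key and that guess's score under
# the chosen metric; the trial succeeds if the guess matches the real key.
(found_key, score) = caesar_break(ciphertext,
                                  metric=norms.l1,
                                  target_frequencies=normalised_english_counts,
                                  message_frequency_scaling=norms.normalise)
print(found_key == key, score)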
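
And a quick worked check of the two new norms, using made-up frequency profiles;
the expected values follow directly from the definitions above:

import norms

profile1 = {'a': 0.5, 'b': 0.3, 'c': 0.2}
profile2 = {'a': 0.4, 'b': 0.35, 'c': 0.25}
# Pointwise differences: 0.1, 0.05, 0.05.
# geometric_mean: (0.1 * 0.05 * 0.05) ** (1/3) is about 0.063
# harmonic_mean: 3 / (1/0.1 + 1/0.05 + 1/0.05) = 3 / 50 = 0.06
print(norms.geometric_mean(profile1, profile2))
print(norms.harmonic_mean(profile1, profile2))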