X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=find_best_caesar_break_parameters.py;h=edab90fcc2c55bfe5dc6b74b425413837030f1cd;hb=3906e8a686e3d1943e22746b65c394a4def34fc0;hp=246400ff65a6628b433e57c237fbf0f493f2f593;hpb=36820d02361529d5327ad040432d0198b72baed2;p=cipher-tools.git

diff --git a/find_best_caesar_break_parameters.py b/find_best_caesar_break_parameters.py
index 246400f..edab90f 100644
--- a/find_best_caesar_break_parameters.py
+++ b/find_best_caesar_break_parameters.py
@@ -2,8 +2,7 @@ import random
 import collections
 from cipher import *
 from cipherbreak import *
-
-print('Loading...')
+import itertools
 
 corpus = sanitise(''.join([open('shakespeare.txt', 'r').read(),
                            open('sherlock-holmes.txt', 'r').read(),
@@ -14,9 +13,9 @@ euclidean_scaled_english_counts = norms.euclidean_scale(english_counts)
 
 metrics = [{'func': norms.l1, 'name': 'l1'},
            {'func': norms.l2, 'name': 'l2'},
-           {'func': norms.l3, 'name': 'l2'},
+           {'func': norms.l3, 'name': 'l3'},
            {'func': norms.cosine_distance, 'name': 'cosine_distance'},
-           {'func': norms.harmonic_mean, 'name': 'harminic_mean'},
+           {'func': norms.harmonic_mean, 'name': 'harmonic_mean'},
            {'func': norms.geometric_mean, 'name': 'geometric_mean'},
            {'func': norms.inverse_log_pl, 'name': 'inverse_log_pl'}]
 scalings = [{'corpus_frequency': normalised_english_counts,
@@ -32,34 +31,35 @@ message_lengths = [300, 100, 50, 30, 20, 10, 5]
 
 trials = 5000
 
-# rebuild with itertools.product and itertools.starmap
-# e.g. results = starmap(one_trial, product(metrics, scalings, message_lengths))
-# ... which would then be easy parallelise.
+scores = collections.defaultdict(int)
+
+def eval_all():
+    list(itertools.starmap(eval_one_parameter_set,
+                           itertools.product(metrics, scalings, message_lengths)))
+
+def eval_one_parameter_set(metric, scaling, message_length):
+    for _ in range(trials):
+        sample_start = random.randint(0, corpus_length - message_length)
+        sample = corpus[sample_start:(sample_start + message_length)]
+        key = random.randint(1, 25)
+        sample_ciphertext = caesar_encipher(sample, key)
+        found_key, _ = caesar_break(sample_ciphertext,
+                                    metric=metric['func'],
+                                    target_counts=scaling['corpus_frequency'],
+                                    message_frequency_scaling=scaling['scaling'])
+        if found_key == key:
+            scores[(metric['name'], scaling['name'], message_length)] += 1
+    return scores[(metric['name'], scaling['name'], message_length)]
+
+def show_results():
+    with open('caesar_break_parameter_trials.csv', 'w') as f:
+        print(',message_length', file = f)
+        print('metric+scaling,', ','.join([str(l) for l in message_lengths]), file = f)
+        for (metric, scaling) in itertools.product(metrics, scalings):
+            print('{}:{}'.format(metric['name'], scaling['name']), end='', file=f)
+            for l in message_lengths:
+                print(',', scores[(metric['name'], scaling['name'], l)] / trials, end='', file=f)
+            print('', file = f)
 
-print('Starting:', end='', flush=True)
-with open('caesar_break_parameter_trials.csv', 'w') as f:
-    print('metric,scaling,message_length,score', file = f)
-    scores = collections.defaultdict(int)
-    for metric in metrics:
-        scores[metric['name']] = collections.defaultdict(int)
-        for scaling in scalings:
-            scores[metric['name']][scaling['name']] = collections.defaultdict(int)
-            for message_length in message_lengths:
-                for i in range(trials):
-                    sample_start = random.randint(0, corpus_length - message_length)
-                    sample = corpus[sample_start:(sample_start + message_length)]
-                    key = random.randint(1, 25)
-                    sample_ciphertext = caesar_encipher(sample, key)
-                    (found_key, score) = caesar_break(sample_ciphertext,
-                                                      metric=metric['func'],
-                                                      target_counts=scaling['corpus_frequency'],
-                                                      message_frequency_scaling=scaling['scaling'])
-                    if found_key == key:
-                        scores[metric['name']][scaling['name']][message_length] += 1
-                print('.', end='', flush=True)
-                print(', '.join([metric['name'],
-                                 scaling['name'],
-                                 str(message_length),
-                                 str(scores[metric['name']][scaling['name']][message_length] / trials) ]),
-                      file = f)
-print()
+eval_all()
+show_results()