Tuned default chunksize for keyword break
[cipher-tools.git] / find_best_caesar_break_parameters.py
1 import random
2 from cipher import *
3
4
def _read_text(filename):
    """Return the entire contents of *filename*, closing the file afterwards."""
    with open(filename, 'r') as text_file:
        return text_file.read()


# Pool of text that random samples are drawn from: the three source texts
# concatenated in this fixed order and then passed through `sanitise`.
# Each file is read inside a `with` block so its handle is released straight
# away instead of being left for the garbage collector.
corpus = sanitise(''.join(_read_text(filename)
                          for filename in ('shakespeare.txt',
                                           'sherlock-holmes.txt',
                                           'war-and-peace.txt')))
corpus_length = len(corpus)
7
# English letter counts rescaled with norms.scale; offered below as an
# alternative target to normalised_english_counts.
scaled_english_counts = norms.scale(english_counts)

# Number of random samples enciphered and broken per parameter combination.
trials = 5000

# Lengths of the plaintext samples to try, longest first.
message_lengths = [300, 100, 50, 30, 20, 10, 5]

# Each list of options is paired, position by position, with a list of
# labels used as dictionary keys and in the printed/CSV output.  Keep every
# pair the same length and in the same order.
metric_names = [
    'l1',
    'l2',
    'l3',
    'cosine_distance',
    'harmonic_mean',
    'geometric_mean',
]
metrics = [
    norms.l1,
    norms.l2,
    norms.l3,
    norms.cosine_distance,
    norms.harmonic_mean,
    norms.geometric_mean,
]

corpus_frequency_names = [
    'normalised_english_counts',
    'scaled_english_counts',
]
corpus_frequencies = [
    normalised_english_counts,
    scaled_english_counts,
]

scaling_names = [
    'normalise',
    'scale',
]
scalings = [
    norms.normalise,
    norms.scale,
]
21
# Run the experiment: for every combination of metric, target frequency
# table, message scaling and message length, encipher `trials` random corpus
# samples with a random Caesar key and count how often caesar_break recovers
# that key.  Counts are stored as
#     scores[metric name][frequency name][scaling name][message length]
scores = collections.defaultdict(int)
for metric_name, metric_fn in zip(metric_names, metrics):
    scores[metric_name] = collections.defaultdict(int)
    for frequency_name, target in zip(corpus_frequency_names,
                                      corpus_frequencies):
        scores[metric_name][frequency_name] = collections.defaultdict(int)
        for scaling_name, scaler in zip(scaling_names, scalings):
            # Alias for the innermost dict so the hot loop avoids three
            # lookups per update; it is the same object stored in `scores`.
            hits = collections.defaultdict(int)
            scores[metric_name][frequency_name][scaling_name] = hits
            for message_length in message_lengths:
                # Latest start index that still leaves a full-length sample.
                last_start = corpus_length - message_length
                for _ in range(trials):
                    sample_start = random.randint(0, last_start)
                    sample_end = sample_start + message_length
                    sample = corpus[sample_start:sample_end]
                    key = random.randint(1, 25)
                    sample_ciphertext = caesar_encipher(sample, key)
                    guessed_key, _fit = caesar_break(
                        sample_ciphertext,
                        metric=metric_fn,
                        target_frequencies=target,
                        message_frequency_scaling=scaler)
                    if guessed_key == key:
                        hits[message_length] += 1
                # Progress report: fraction of trials solved for this
                # combination, printed once all its trials are done.
                success_rate = hits[message_length] / trials
                print(', '.join([metric_name,
                                 frequency_name,
                                 scaling_name,
                                 str(message_length),
                                 str(success_rate)]))
46
47
# Write the success rate of every parameter combination to a file, one
# ', '-separated row per combination:
#     metric, corpus frequency, scaling, message length, fraction solved
with open('caesar_break_parameter_trials.csv', 'w') as f:
    for metric_name in metric_names:
        for frequency_name in corpus_frequency_names:
            for scaling_name in scaling_names:
                tallies = scores[metric_name][frequency_name][scaling_name]
                for message_length in message_lengths:
                    row = [metric_name,
                           frequency_name,
                           scaling_name,
                           str(message_length),
                           str(tallies[message_length] / trials)]
                    print(', '.join(row), file=f)
59
60