4 from cipherbreak
import *
def _read_text(filename):
    """Return the entire contents of the text file *filename*.

    The file is opened in text mode with the platform default encoding and
    is closed before returning, so no handle is left open at module level.
    """
    with open(filename, 'r') as text_file:
        return text_file.read()


# Reference corpus: the three books concatenated in this order and then passed
# through sanitise() (imported from cipherbreak).
corpus = sanitise(''.join(_read_text(filename)
                          for filename in ('shakespeare.txt',
                                           'sherlock-holmes.txt',
                                           'war-and-peace.txt')))
# Computed once; the trial loop uses it to bound the random sample offset.
corpus_length = len(corpus)

# english_counts rescaled with norms.euclidean_scale; this is the reference
# frequency table paired with the 'euclidean_scaled' entry in `scalings`.
euclidean_scaled_english_counts = norms.euclidean_scale(english_counts)
# Scoring functions under trial. 'func' is the callable handed to
# caesar_break() as its `metric` argument; 'name' is the label used both as
# the first-level key of `scores` and as the 'metric' column of the results
# CSV. Names therefore have to be unique: a duplicated name makes a later
# metric overwrite an earlier metric's scores and mislabels its CSV rows.
metrics = [
    {'func': norms.l1, 'name': 'l1'},
    {'func': norms.l2, 'name': 'l2'},
    {'func': norms.l3, 'name': 'l3'},
    {'func': norms.cosine_distance, 'name': 'cosine_distance'},
    {'func': norms.harmonic_mean, 'name': 'harmonic_mean'},
    {'func': norms.geometric_mean, 'name': 'geometric_mean'},
    {'func': norms.inverse_log_pl, 'name': 'inverse_log_pl'},
]
# Frequency-scaling options under trial. Each entry pairs a reference
# frequency table ('corpus_frequency', passed to caesar_break() as
# `target_counts`) with the function applied to the message's own counts
# ('scaling', passed as `message_frequency_scaling`); 'name' labels the
# combination in `scores` and in the results CSV.
scalings = [
    {'corpus_frequency': reference_counts,
     'scaling': scale_function,
     'name': label}
    for reference_counts, scale_function, label in (
        (normalised_english_counts,
         norms.normalise,
         'normalised'),
        (euclidean_scaled_english_counts,
         norms.euclidean_scale,
         'euclidean_scaled'),
        (normalised_english_counts,
         norms.identity_scale,
         'normalised_with_identity'),
    )
]

# Sample lengths (in characters of the sanitised corpus) to try, longest first.
message_lengths = [300, 100, 50, 30, 20, 10, 5]
# rebuild with itertools.product and itertools.starmap
# e.g. results = starmap(one_trial, product(metrics, scalings, message_lengths))
# ... which would then be easy to parallelise.
# Progress banner. end='' keeps the cursor on the same line so the '.' printed
# after each batch of trials follows it directly; flush=True makes the banner
# appear immediately rather than waiting for the output buffer to fill.
print('Starting:', end='', flush=True)
40 with open('caesar_break_parameter_trials.csv', 'w') as f
:
41 print('metric,scaling,message_length,score', file = f
)
42 scores
= collections
.defaultdict(int)
43 for metric
in metrics
:
44 scores
[metric
['name']] = collections
.defaultdict(int)
45 for scaling
in scalings
:
46 scores
[metric
['name']][scaling
['name']] = collections
.defaultdict(int)
47 for message_length
in message_lengths
:
48 for i
in range(trials
):
49 sample_start
= random
.randint(0, corpus_length
- message_length
)
50 sample
= corpus
[sample_start
:(sample_start
+ message_length
)]
51 key
= random
.randint(1, 25)
52 sample_ciphertext
= caesar_encipher(sample
, key
)
53 (found_key
, score
) = caesar_break(sample_ciphertext
,
54 metric
=metric
['func'],
55 target_counts
=scaling
['corpus_frequency'],
56 message_frequency_scaling
=scaling
['scaling'])
58 scores
[metric
['name']][scaling
['name']][message_length
] += 1
59 print('.', end
='', flush
=True)
60 print(', '.join([metric
['name'],
63 str(scores
[metric
['name']][scaling
['name']][message_length
] / trials
) ]),