4 from cipherbreak
import *
def _read_text_file(filename):
    """Return the entire contents of the text file *filename*.

    The file is opened inside a ``with`` block so its handle is closed as
    soon as the text has been read, instead of being left for the garbage
    collector.
    """
    with open(filename, 'r') as text_file:
        return text_file.read()


# Text that the trials draw their samples from: the three source files are
# read in order, joined into one string and passed through sanitise().
corpus = sanitise(''.join(_read_text_file(filename) for filename in (
    'shakespeare.txt',
    'sherlock-holmes.txt',
    'war-and-peace.txt',
)))
corpus_length = len(corpus)

# english_counts rescaled with norms.euclidean_scale; used as the reference
# frequencies of the 'euclidean_scaled' entry in scalings.
euclidean_scaled_english_counts = norms.euclidean_scale(english_counts)
# Commented-out scoring helpers, retained for reference; they are not executed.
# def frequency_compare(text, target_frequency, frequency_scaling, metric):
#     counts = frequency_scaling(frequencies(text))
#     return -1 * metric(target_frequency, counts)

# def euclidean_compare(text):
#     return frequency_compare(text, norms.euclidean_scale(english_counts),
#             norms.euclidean_scale, norms.euclidean_distance)
# Candidate comparison metrics. Each entry carries the metric callable, an
# 'invert' flag that scoring_functions() forwards to
# make_frequency_compare_function, and a display name.
metrics = [
    {'func': func, 'invert': invert, 'name': name}
    for func, invert, name in (
        (norms.l1, True, 'l1'),
        (norms.l2, True, 'l2'),
        (norms.l3, True, 'l3'),
        (norms.cosine_distance, False, 'cosine_distance'),
        (norms.harmonic_mean, True, 'harmonic_mean'),
        (norms.geometric_mean, True, 'geometric_mean'),
    )
]

# Candidate frequency scalings. Each entry pairs a pre-scaled reference
# frequency table with the scaling callable applied to a text's own
# frequencies, plus a display name.
scalings = [
    {'corpus_frequency': corpus_frequency, 'scaling': scaling, 'name': name}
    for corpus_frequency, scaling, name in (
        (normalised_english_counts, norms.normalise, 'normalised'),
        (euclidean_scaled_english_counts, norms.euclidean_scale,
         'euclidean_scaled'),
    )
]

# Sample lengths (in characters of the sanitised corpus) tried for each
# scoring function.
message_lengths = [300, 100, 50, 30, 20, 10, 5]
def make_frequency_compare_function(target_frequency, frequency_scaling,
                                    metric, invert):
    """Build a one-argument function that scores a text against a reference.

    Parameters:
        target_frequency: reference frequencies, passed as the first
            argument to ``metric``.
        frequency_scaling: callable applied to ``frequencies(text)`` before
            the comparison.
        metric: callable taking ``(target_frequency, counts)`` and returning
            a number.
        invert: when true, the metric's value is multiplied by -1 before it
            is returned; when false it is returned unchanged.

    Returns:
        A function ``frequency_compare(text)`` returning the (possibly
        negated) metric value for ``text``.
    """
    def frequency_compare(text):
        counts = frequency_scaling(frequencies(text))
        score = metric(target_frequency, counts)
        # Honour the invert flag and hand the score back to the caller;
        # without the branch the flag would be ignored, and without the
        # return the scorer would always yield None.
        if invert:
            return -1 * score
        return score
    return frequency_compare
def scoring_functions():
    """Return every scoring function to be trialled, each with a name.

    One entry is built for every (metric, scaling) combination, with the
    metric as the outer loop, by calling make_frequency_compare_function
    with the scaling's reference frequencies and scaling callable and the
    metric's callable and invert flag. Its name is
    '<metric name> + <scaling name>'. A final entry for Pletters is appended.

    Returns:
        A list of dicts with the keys 'func' and 'name'.
    """
    combined = [
        {'func': make_frequency_compare_function(s['corpus_frequency'],
                                                 s['scaling'],
                                                 m['func'],
                                                 m['invert']),
         'name': '{} + {}'.format(m['name'], s['name'])}
        # Both loops are required: the element expression reads m as well
        # as s, so iterating over scalings alone leaves m unbound.
        for m in metrics
        for s in scalings
    ]
    return combined + [{'func': Pletters, 'name': 'Pletters'}]
61 for f
in scoring_functions()
62 for l
in message_lengths
]
def eval_one_score(scoring_function, message_length):
    """Count how often one scoring function recovers a random Caesar key.

    For each of ``trials`` rounds a slice of ``message_length`` characters
    is taken from a random position in ``corpus``, enciphered with a random
    key in 1..25, and passed to caesar_break with the scoring function's
    'func'. A round is a success when the first element of caesar_break's
    result equals the key that was used.

    Parameters:
        scoring_function: dict with a 'name' (used as the key into the
            module-level ``scores`` mapping) and a 'func' callable.
        message_length: number of corpus characters in each sample.

    Returns:
        The success count stored in ``scores[name][message_length]`` after
        the rounds have run.

    Side effects:
        Prints the name and message length, and updates ``scores``.
    """
    name = scoring_function['name']
    print(name, message_length)
    if name not in scores:
        scores[name] = collections.defaultdict(int)
    for _trial in range(trials):
        # randint includes both endpoints, so the last possible start still
        # leaves a full message_length characters before the corpus ends.
        sample_start = random.randint(0, corpus_length - message_length)
        sample = corpus[sample_start:sample_start + message_length]
        key = random.randint(1, 25)
        ciphertext = caesar_encipher(sample, key)
        found_key, _fit = caesar_break(ciphertext, scoring_function['func'])
        # Only a correctly recovered key counts; an unconditional increment
        # would record every trial as a success.
        if found_key == key:
            scores[name][message_length] += 1
    # Reading through the defaultdict also creates a 0 entry when no trial
    # succeeded, so later lookups of this length always find a value.
    return scores[name][message_length]
# Write the success rate (successes / trials) of every scoring function at
# every message length to a CSV-style report: two header lines, then one
# line per scoring function in sorted name order. `scores` and `trials` are
# module-level names defined outside this statement.
with open('caesar_break_parameter_trials.csv', 'w') as f:
    print(',message_length', file=f)
    print('scoring,', ', '.join([str(l) for l in message_lengths]), file=f)
    for scoring in sorted(scores):
        # Each cell keeps the ', <value>' form produced by
        # print(',', value, end=''). The row is emitted once and ends with a
        # newline, rather than being repeated for every message length and
        # run together with the next row.
        cells = ''.join(
            ', {}'.format(scores[scoring][length] / trials)
            for length in message_lengths
        )
        print('{}{}'.format(scoring, cells), file=f)