Added experiment for checking which metrics work best for using unigram frequency...
authorNeil Smith <neil.github@njae.me.uk>
Sun, 6 Oct 2013 17:09:49 +0000 (18:09 +0100)
committerNeil Smith <neil.github@njae.me.uk>
Sun, 6 Oct 2013 17:09:49 +0000 (18:09 +0100)
.gitignore [new file with mode: 0644]
__pycache__/cipher.cpython-33.pyc [new file with mode: 0644]
caesar_break_parameter_trials.csv [new file with mode: 0644]
find_best_caesar_break_parameters.py [new file with mode: 0644]
norms.py
norms.pyc [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..14ba524
--- /dev/null
@@ -0,0 +1,8 @@
+*~
+/.bundle
+/config/database.yml
+/config/deploy.rb
+*doc
+/log/*log
+/tmp
+
diff --git a/__pycache__/cipher.cpython-33.pyc b/__pycache__/cipher.cpython-33.pyc
new file mode 100644 (file)
index 0000000..05222b0
Binary files /dev/null and b/__pycache__/cipher.cpython-33.pyc differ
diff --git a/caesar_break_parameter_trials.csv b/caesar_break_parameter_trials.csv
new file mode 100644 (file)
index 0000000..df9b836
--- /dev/null
@@ -0,0 +1,144 @@
+l1, normalised_english_counts, normalise, 3000, 0.9616
+l1, normalised_english_counts, normalise, 1000, 0.9562
+l1, normalised_english_counts, normalise, 300, 0.9598
+l1, normalised_english_counts, normalise, 100, 0.9622
+l1, normalised_english_counts, normalise, 50, 0.9584
+l1, normalised_english_counts, normalise, 30, 0.953
+l1, normalised_english_counts, normalise, 20, 0.917
+l1, normalised_english_counts, normalise, 10, 0.7328
+l1, normalised_english_counts, normalise, 5, 0.4394
+l1, normalised_english_counts, scale, 3000, 0.9618
+l1, normalised_english_counts, scale, 1000, 0.9574
+l1, normalised_english_counts, scale, 300, 0.9624
+l1, normalised_english_counts, scale, 100, 0.9566
+l1, normalised_english_counts, scale, 50, 0.959
+l1, normalised_english_counts, scale, 30, 0.9476
+l1, normalised_english_counts, scale, 20, 0.8968
+l1, normalised_english_counts, scale, 10, 0.6844
+l1, normalised_english_counts, scale, 5, 0.4298
+l1, scaled_english_counts, normalise, 3000, 0.957
+l1, scaled_english_counts, normalise, 1000, 0.9662
+l1, scaled_english_counts, normalise, 300, 0.9604
+l1, scaled_english_counts, normalise, 100, 0.9602
+l1, scaled_english_counts, normalise, 50, 0.9578
+l1, scaled_english_counts, normalise, 30, 0.9504
+l1, scaled_english_counts, normalise, 20, 0.9174
+l1, scaled_english_counts, normalise, 10, 0.7204
+l1, scaled_english_counts, normalise, 5, 0.4506
+l1, scaled_english_counts, scale, 3000, 0.9584
+l1, scaled_english_counts, scale, 1000, 0.9586
+l1, scaled_english_counts, scale, 300, 0.964
+l1, scaled_english_counts, scale, 100, 0.9582
+l1, scaled_english_counts, scale, 50, 0.9606
+l1, scaled_english_counts, scale, 30, 0.944
+l1, scaled_english_counts, scale, 20, 0.915
+l1, scaled_english_counts, scale, 10, 0.7324
+l1, scaled_english_counts, scale, 5, 0.4446
+l2, normalised_english_counts, normalise, 3000, 0.953
+l2, normalised_english_counts, normalise, 1000, 0.962
+l2, normalised_english_counts, normalise, 300, 0.9638
+l2, normalised_english_counts, normalise, 100, 0.9632
+l2, normalised_english_counts, normalise, 50, 0.9604
+l2, normalised_english_counts, normalise, 30, 0.95
+l2, normalised_english_counts, normalise, 20, 0.892
+l2, normalised_english_counts, normalise, 10, 0.7124
+l2, normalised_english_counts, normalise, 5, 0.4406
+l2, normalised_english_counts, scale, 3000, 0.9626
+l2, normalised_english_counts, scale, 1000, 0.956
+l2, normalised_english_counts, scale, 300, 0.962
+l2, normalised_english_counts, scale, 100, 0.9572
+l2, normalised_english_counts, scale, 50, 0.9526
+l2, normalised_english_counts, scale, 30, 0.9478
+l2, normalised_english_counts, scale, 20, 0.9046
+l2, normalised_english_counts, scale, 10, 0.6896
+l2, normalised_english_counts, scale, 5, 0.4308
+l2, scaled_english_counts, normalise, 3000, 0.9574
+l2, scaled_english_counts, normalise, 1000, 0.9568
+l2, scaled_english_counts, normalise, 300, 0.9536
+l2, scaled_english_counts, normalise, 100, 0.9624
+l2, scaled_english_counts, normalise, 50, 0.9606
+l2, scaled_english_counts, normalise, 30, 0.9384
+l2, scaled_english_counts, normalise, 20, 0.8914
+l2, scaled_english_counts, normalise, 10, 0.6892
+l2, scaled_english_counts, normalise, 5, 0.4196
+l2, scaled_english_counts, scale, 3000, 0.9532
+l2, scaled_english_counts, scale, 1000, 0.9588
+l2, scaled_english_counts, scale, 300, 0.9644
+l2, scaled_english_counts, scale, 100, 0.9572
+l2, scaled_english_counts, scale, 50, 0.9586
+l2, scaled_english_counts, scale, 30, 0.9436
+l2, scaled_english_counts, scale, 20, 0.9036
+l2, scaled_english_counts, scale, 10, 0.693
+l2, scaled_english_counts, scale, 5, 0.4376
+l3, normalised_english_counts, normalise, 3000, 0.9626
+l3, normalised_english_counts, normalise, 1000, 0.9582
+l3, normalised_english_counts, normalise, 300, 0.9542
+l3, normalised_english_counts, normalise, 100, 0.9606
+l3, normalised_english_counts, normalise, 50, 0.953
+l3, normalised_english_counts, normalise, 30, 0.9248
+l3, normalised_english_counts, normalise, 20, 0.8684
+l3, normalised_english_counts, normalise, 10, 0.6106
+l3, normalised_english_counts, normalise, 5, 0.4064
+l3, normalised_english_counts, scale, 3000, 0.961
+l3, normalised_english_counts, scale, 1000, 0.9568
+l3, normalised_english_counts, scale, 300, 0.9566
+l3, normalised_english_counts, scale, 100, 0.9554
+l3, normalised_english_counts, scale, 50, 0.9436
+l3, normalised_english_counts, scale, 30, 0.8936
+l3, normalised_english_counts, scale, 20, 0.8016
+l3, normalised_english_counts, scale, 10, 0.579
+l3, normalised_english_counts, scale, 5, 0.4114
+l3, scaled_english_counts, normalise, 3000, 0.9616
+l3, scaled_english_counts, normalise, 1000, 0.9612
+l3, scaled_english_counts, normalise, 300, 0.9624
+l3, scaled_english_counts, normalise, 100, 0.9524
+l3, scaled_english_counts, normalise, 50, 0.9474
+l3, scaled_english_counts, normalise, 30, 0.9066
+l3, scaled_english_counts, normalise, 20, 0.8004
+l3, scaled_english_counts, normalise, 10, 0.5686
+l3, scaled_english_counts, normalise, 5, 0.3404
+l3, scaled_english_counts, scale, 3000, 0.96
+l3, scaled_english_counts, scale, 1000, 0.96
+l3, scaled_english_counts, scale, 300, 0.9596
+l3, scaled_english_counts, scale, 100, 0.96
+l3, scaled_english_counts, scale, 50, 0.954
+l3, scaled_english_counts, scale, 30, 0.9374
+l3, scaled_english_counts, scale, 20, 0.862
+l3, scaled_english_counts, scale, 10, 0.6276
+l3, scaled_english_counts, scale, 5, 0.399
+cosine_distance, normalised_english_counts, normalise, 3000, 0.9618
+cosine_distance, normalised_english_counts, normalise, 1000, 0.96
+cosine_distance, normalised_english_counts, normalise, 300, 0.9604
+cosine_distance, normalised_english_counts, normalise, 100, 0.9538
+cosine_distance, normalised_english_counts, normalise, 50, 0.9608
+cosine_distance, normalised_english_counts, normalise, 30, 0.9426
+cosine_distance, normalised_english_counts, normalise, 20, 0.9012
+cosine_distance, normalised_english_counts, normalise, 10, 0.6916
+cosine_distance, normalised_english_counts, normalise, 5, 0.4286
+cosine_distance, normalised_english_counts, scale, 3000, 0.9606
+cosine_distance, normalised_english_counts, scale, 1000, 0.9572
+cosine_distance, normalised_english_counts, scale, 300, 0.9628
+cosine_distance, normalised_english_counts, scale, 100, 0.959
+cosine_distance, normalised_english_counts, scale, 50, 0.9542
+cosine_distance, normalised_english_counts, scale, 30, 0.951
+cosine_distance, normalised_english_counts, scale, 20, 0.9028
+cosine_distance, normalised_english_counts, scale, 10, 0.7028
+cosine_distance, normalised_english_counts, scale, 5, 0.44
+cosine_distance, scaled_english_counts, normalise, 3000, 0.9582
+cosine_distance, scaled_english_counts, normalise, 1000, 0.9614
+cosine_distance, scaled_english_counts, normalise, 300, 0.9632
+cosine_distance, scaled_english_counts, normalise, 100, 0.9584
+cosine_distance, scaled_english_counts, normalise, 50, 0.9574
+cosine_distance, scaled_english_counts, normalise, 30, 0.9506
+cosine_distance, scaled_english_counts, normalise, 20, 0.8956
+cosine_distance, scaled_english_counts, normalise, 10, 0.6916
+cosine_distance, scaled_english_counts, normalise, 5, 0.4356
+cosine_distance, scaled_english_counts, scale, 3000, 0.9572
+cosine_distance, scaled_english_counts, scale, 1000, 0.961
+cosine_distance, scaled_english_counts, scale, 300, 0.9596
+cosine_distance, scaled_english_counts, scale, 100, 0.9544
+cosine_distance, scaled_english_counts, scale, 50, 0.9598
+cosine_distance, scaled_english_counts, scale, 30, 0.9414
+cosine_distance, scaled_english_counts, scale, 20, 0.9036
+cosine_distance, scaled_english_counts, scale, 10, 0.6928
+cosine_distance, scaled_english_counts, scale, 5, 0.4178
diff --git a/find_best_caesar_break_parameters.py b/find_best_caesar_break_parameters.py
new file mode 100644 (file)
index 0000000..711cff0
--- /dev/null
@@ -0,0 +1,60 @@
import collections
import random

from cipher import *

# Experiment: measure which combination of distance metric, corpus
# frequency representation, message-frequency scaling and message length
# gives the highest success rate when breaking Caesar ciphers.
# Results are printed as they are produced and also written to
# caesar_break_parameter_trials.csv at the end.


def _read_corpus(*filenames):
    """Concatenate and sanitise the named text files into one corpus string."""
    texts = []
    for filename in filenames:
        with open(filename, 'r') as f:
            texts.append(f.read())
    return sanitise(''.join(texts))


corpus = _read_corpus('shakespeare.txt', 'sherlock-holmes.txt',
                      'war-and-peace.txt')
corpus_length = len(corpus)

scaled_english_counts = norms.scale(english_counts)


# Parameter grid: each list of values is paired element-wise with the
# matching list of names so results can be reported readably.
metrics = [norms.l1, norms.l2, norms.l3, norms.cosine_distance,
           norms.harmonic_mean, norms.geometric_mean]
metric_names = ['l1', 'l2', 'l3', 'cosine_distance',
                'harmonic_mean', 'geometric_mean']
corpus_frequencies = [normalised_english_counts, scaled_english_counts]
corpus_frequency_names = ['normalised_english_counts',
                          'scaled_english_counts']
scalings = [norms.normalise, norms.scale]
scaling_names = ['normalise', 'scale']
message_lengths = [3000, 1000, 300, 100, 50, 30, 20, 10, 5]

trials = 5000

# Success counts keyed by (metric, frequency, scaling, length) name tuple.
scores = collections.defaultdict(int)


def _result_line(metric_name, frequency_name, scaling_name, message_length):
    """Format one CSV row: the parameter names plus the fraction of trials
    in which the break recovered the correct key."""
    success_rate = scores[(metric_name, frequency_name,
                           scaling_name, message_length)] / trials
    return ', '.join([metric_name, frequency_name, scaling_name,
                      str(message_length), str(success_rate)])


for metric, metric_name in zip(metrics, metric_names):
    for frequencies, frequency_name in zip(corpus_frequencies,
                                           corpus_frequency_names):
        for scaling, scaling_name in zip(scalings, scaling_names):
            for message_length in message_lengths:
                for _ in range(trials):
                    # Take a random sample of the corpus and encipher it
                    # with a random non-trivial key (1-25).
                    sample_start = random.randint(
                        0, corpus_length - message_length)
                    sample = corpus[sample_start:(sample_start +
                                                  message_length)]
                    key = random.randint(1, 25)
                    sample_ciphertext = caesar_encipher(sample, key)
                    (found_key, score) = caesar_break(
                        sample_ciphertext,
                        metric=metric,
                        target_frequencies=frequencies,
                        message_frequency_scaling=scaling)
                    if found_key == key:
                        scores[(metric_name, frequency_name,
                                scaling_name, message_length)] += 1
                # Progress report: one line per parameter combination.
                print(_result_line(metric_name, frequency_name,
                                   scaling_name, message_length))


with open('caesar_break_parameter_trials.csv', 'w') as f:
    for metric_name in metric_names:
        for frequency_name in corpus_frequency_names:
            for scaling_name in scaling_names:
                for message_length in message_lengths:
                    print(_result_line(metric_name, frequency_name,
                                       scaling_name, message_length),
                          file=f)
\ No newline at end of file
index 744cbe4d9d4336f574d7b0885fed973189acd7d9..4fdf1e3d85bb347c501bcb88c6caec7a8c969035 100644 (file)
--- a/norms.py
+++ b/norms.py
@@ -96,6 +96,27 @@ def l3(frequencies1, frequencies2):
         total += abs(frequencies1[k] - frequencies2[k]) ** 3
     return total ** (1/3)
 
def geometric_mean(frequencies1, frequencies2):
    """Finds the distance between two frequency profiles, expressed as
    dictionaries, as the product of the element-wise absolute differences.
    Assumes every key in frequencies1 is also in frequencies2.

    Note: if any pair of frequencies is equal, the whole product is zero.
    """
    # Start from the multiplicative identity: the original initialised
    # this to 0, which made the product — and so the metric — always 0.
    total = 1
    for k in frequencies1.keys():
        total *= abs(frequencies1[k] - frequencies2[k])
    return total
+
def harmonic_mean(frequencies1, frequencies2):
    """Finds the distance between two frequency profiles, expressed as
    dictionaries, as the reciprocal of the sum of reciprocals of the
    element-wise absolute differences.
    Assumes every key in frequencies1 is also in frequencies2.

    Returns 0 if any pair of frequencies is equal: by convention the
    harmonic mean of a set containing zero is zero (the original code
    raised ZeroDivisionError in that case).
    """
    total = 0
    for k in frequencies1.keys():
        difference = abs(frequencies1[k] - frequencies2[k])
        if difference == 0:
            # Avoid dividing by zero; a zero difference dominates,
            # driving the harmonic combination to zero.
            return 0
        total += 1 / difference
    return 1 / total
+
+
 def cosine_distance(frequencies1, frequencies2):
     """Finds the distances between two frequency profiles, expressed as dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
diff --git a/norms.pyc b/norms.pyc
new file mode 100644 (file)
index 0000000..540caf5
Binary files /dev/null and b/norms.pyc differ