Fixed bugs in geometric and harmonic means, added some tests.
author Neil Smith <neil.github@njae.me.uk>
Mon, 7 Oct 2013 12:28:24 +0000 (13:28 +0100)
committer Neil Smith <neil.github@njae.me.uk>
Mon, 7 Oct 2013 12:28:24 +0000 (13:28 +0100)
__pycache__/cipher.cpython-33.pyc
__pycache__/norms.cpython-33.pyc
caesar_break_parameter_trials.csv
cipher.py
find_best_caesar_break_parameters.py
norms.py

index 05222b00a2b63868a0e7d9cae75cb0884fed535a..71c263ebc64d9ac84291551734dcedb0484849e7 100644 (file)
Binary files a/__pycache__/cipher.cpython-33.pyc and b/__pycache__/cipher.cpython-33.pyc differ
index b18ed4b715278fb73b31ca57c434135a232eb74b..7d10c2f61bc74f9763ecdc1456169892160835f5 100644 (file)
Binary files a/__pycache__/norms.cpython-33.pyc and b/__pycache__/norms.cpython-33.pyc differ
index df9b836ebdfec843d0f42cb319729dfd1a45836c..ba7ee273a0682301beab9d4da6d0db1cba45f0ec 100644 (file)
-l1, normalised_english_counts, normalise, 3000, 0.9616
-l1, normalised_english_counts, normalise, 1000, 0.9562
-l1, normalised_english_counts, normalise, 300, 0.9598
-l1, normalised_english_counts, normalise, 100, 0.9622
-l1, normalised_english_counts, normalise, 50, 0.9584
-l1, normalised_english_counts, normalise, 30, 0.953
-l1, normalised_english_counts, normalise, 20, 0.917
-l1, normalised_english_counts, normalise, 10, 0.7328
-l1, normalised_english_counts, normalise, 5, 0.4394
-l1, normalised_english_counts, scale, 3000, 0.9618
-l1, normalised_english_counts, scale, 1000, 0.9574
-l1, normalised_english_counts, scale, 300, 0.9624
-l1, normalised_english_counts, scale, 100, 0.9566
-l1, normalised_english_counts, scale, 50, 0.959
-l1, normalised_english_counts, scale, 30, 0.9476
-l1, normalised_english_counts, scale, 20, 0.8968
-l1, normalised_english_counts, scale, 10, 0.6844
-l1, normalised_english_counts, scale, 5, 0.4298
-l1, scaled_english_counts, normalise, 3000, 0.957
-l1, scaled_english_counts, normalise, 1000, 0.9662
-l1, scaled_english_counts, normalise, 300, 0.9604
-l1, scaled_english_counts, normalise, 100, 0.9602
-l1, scaled_english_counts, normalise, 50, 0.9578
-l1, scaled_english_counts, normalise, 30, 0.9504
-l1, scaled_english_counts, normalise, 20, 0.9174
-l1, scaled_english_counts, normalise, 10, 0.7204
-l1, scaled_english_counts, normalise, 5, 0.4506
-l1, scaled_english_counts, scale, 3000, 0.9584
-l1, scaled_english_counts, scale, 1000, 0.9586
-l1, scaled_english_counts, scale, 300, 0.964
-l1, scaled_english_counts, scale, 100, 0.9582
-l1, scaled_english_counts, scale, 50, 0.9606
-l1, scaled_english_counts, scale, 30, 0.944
-l1, scaled_english_counts, scale, 20, 0.915
-l1, scaled_english_counts, scale, 10, 0.7324
-l1, scaled_english_counts, scale, 5, 0.4446
-l2, normalised_english_counts, normalise, 3000, 0.953
-l2, normalised_english_counts, normalise, 1000, 0.962
-l2, normalised_english_counts, normalise, 300, 0.9638
-l2, normalised_english_counts, normalise, 100, 0.9632
-l2, normalised_english_counts, normalise, 50, 0.9604
-l2, normalised_english_counts, normalise, 30, 0.95
-l2, normalised_english_counts, normalise, 20, 0.892
-l2, normalised_english_counts, normalise, 10, 0.7124
-l2, normalised_english_counts, normalise, 5, 0.4406
-l2, normalised_english_counts, scale, 3000, 0.9626
-l2, normalised_english_counts, scale, 1000, 0.956
-l2, normalised_english_counts, scale, 300, 0.962
-l2, normalised_english_counts, scale, 100, 0.9572
-l2, normalised_english_counts, scale, 50, 0.9526
-l2, normalised_english_counts, scale, 30, 0.9478
-l2, normalised_english_counts, scale, 20, 0.9046
-l2, normalised_english_counts, scale, 10, 0.6896
-l2, normalised_english_counts, scale, 5, 0.4308
-l2, scaled_english_counts, normalise, 3000, 0.9574
-l2, scaled_english_counts, normalise, 1000, 0.9568
-l2, scaled_english_counts, normalise, 300, 0.9536
-l2, scaled_english_counts, normalise, 100, 0.9624
-l2, scaled_english_counts, normalise, 50, 0.9606
-l2, scaled_english_counts, normalise, 30, 0.9384
-l2, scaled_english_counts, normalise, 20, 0.8914
-l2, scaled_english_counts, normalise, 10, 0.6892
-l2, scaled_english_counts, normalise, 5, 0.4196
-l2, scaled_english_counts, scale, 3000, 0.9532
-l2, scaled_english_counts, scale, 1000, 0.9588
-l2, scaled_english_counts, scale, 300, 0.9644
-l2, scaled_english_counts, scale, 100, 0.9572
-l2, scaled_english_counts, scale, 50, 0.9586
-l2, scaled_english_counts, scale, 30, 0.9436
-l2, scaled_english_counts, scale, 20, 0.9036
-l2, scaled_english_counts, scale, 10, 0.693
-l2, scaled_english_counts, scale, 5, 0.4376
-l3, normalised_english_counts, normalise, 3000, 0.9626
-l3, normalised_english_counts, normalise, 1000, 0.9582
-l3, normalised_english_counts, normalise, 300, 0.9542
-l3, normalised_english_counts, normalise, 100, 0.9606
-l3, normalised_english_counts, normalise, 50, 0.953
-l3, normalised_english_counts, normalise, 30, 0.9248
-l3, normalised_english_counts, normalise, 20, 0.8684
-l3, normalised_english_counts, normalise, 10, 0.6106
-l3, normalised_english_counts, normalise, 5, 0.4064
-l3, normalised_english_counts, scale, 3000, 0.961
-l3, normalised_english_counts, scale, 1000, 0.9568
-l3, normalised_english_counts, scale, 300, 0.9566
-l3, normalised_english_counts, scale, 100, 0.9554
-l3, normalised_english_counts, scale, 50, 0.9436
-l3, normalised_english_counts, scale, 30, 0.8936
-l3, normalised_english_counts, scale, 20, 0.8016
-l3, normalised_english_counts, scale, 10, 0.579
+l1, normalised_english_counts, normalise, 300, 0.9992
+l1, normalised_english_counts, normalise, 100, 0.9996
+l1, normalised_english_counts, normalise, 50, 0.9992
+l1, normalised_english_counts, normalise, 30, 0.9914
+l1, normalised_english_counts, normalise, 20, 0.9532
+l1, normalised_english_counts, normalise, 10, 0.7442
+l1, normalised_english_counts, normalise, 5, 0.4358
+l1, normalised_english_counts, scale, 300, 1.0
+l1, normalised_english_counts, scale, 100, 0.999
+l1, normalised_english_counts, scale, 50, 0.9988
+l1, normalised_english_counts, scale, 30, 0.9848
+l1, normalised_english_counts, scale, 20, 0.9316
+l1, normalised_english_counts, scale, 10, 0.715
+l1, normalised_english_counts, scale, 5, 0.436
+l1, scaled_english_counts, normalise, 300, 0.9994
+l1, scaled_english_counts, normalise, 100, 0.9998
+l1, scaled_english_counts, normalise, 50, 0.999
+l1, scaled_english_counts, normalise, 30, 0.9868
+l1, scaled_english_counts, normalise, 20, 0.9482
+l1, scaled_english_counts, normalise, 10, 0.7434
+l1, scaled_english_counts, normalise, 5, 0.4532
+l1, scaled_english_counts, scale, 300, 0.9996
+l1, scaled_english_counts, scale, 100, 1.0
+l1, scaled_english_counts, scale, 50, 0.9988
+l1, scaled_english_counts, scale, 30, 0.9874
+l1, scaled_english_counts, scale, 20, 0.9488
+l1, scaled_english_counts, scale, 10, 0.745
+l1, scaled_english_counts, scale, 5, 0.4548
+l2, normalised_english_counts, normalise, 300, 0.9994
+l2, normalised_english_counts, normalise, 100, 0.9992
+l2, normalised_english_counts, normalise, 50, 0.9978
+l2, normalised_english_counts, normalise, 30, 0.9836
+l2, normalised_english_counts, normalise, 20, 0.9318
+l2, normalised_english_counts, normalise, 10, 0.7072
+l2, normalised_english_counts, normalise, 5, 0.4294
+l2, normalised_english_counts, scale, 300, 0.9988
+l2, normalised_english_counts, scale, 100, 0.9998
+l2, normalised_english_counts, scale, 50, 0.9978
+l2, normalised_english_counts, scale, 30, 0.9868
+l2, normalised_english_counts, scale, 20, 0.9364
+l2, normalised_english_counts, scale, 10, 0.7136
+l2, normalised_english_counts, scale, 5, 0.446
+l2, scaled_english_counts, normalise, 300, 0.9992
+l2, scaled_english_counts, normalise, 100, 0.9996
+l2, scaled_english_counts, normalise, 50, 0.9984
+l2, scaled_english_counts, normalise, 30, 0.9854
+l2, scaled_english_counts, normalise, 20, 0.9328
+l2, scaled_english_counts, normalise, 10, 0.7122
+l2, scaled_english_counts, normalise, 5, 0.4328
+l2, scaled_english_counts, scale, 300, 1.0
+l2, scaled_english_counts, scale, 100, 0.9998
+l2, scaled_english_counts, scale, 50, 0.9972
+l2, scaled_english_counts, scale, 30, 0.9842
+l2, scaled_english_counts, scale, 20, 0.9356
+l2, scaled_english_counts, scale, 10, 0.7126
+l2, scaled_english_counts, scale, 5, 0.4318
+l3, normalised_english_counts, normalise, 300, 0.9996
+l3, normalised_english_counts, normalise, 100, 0.999
+l3, normalised_english_counts, normalise, 50, 0.994
+l3, normalised_english_counts, normalise, 30, 0.9658
+l3, normalised_english_counts, normalise, 20, 0.8926
+l3, normalised_english_counts, normalise, 10, 0.6252
+l3, normalised_english_counts, normalise, 5, 0.3974
+l3, normalised_english_counts, scale, 300, 0.9996
+l3, normalised_english_counts, scale, 100, 0.998
+l3, normalised_english_counts, scale, 50, 0.9828
+l3, normalised_english_counts, scale, 30, 0.9334
+l3, normalised_english_counts, scale, 20, 0.8304
+l3, normalised_english_counts, scale, 10, 0.5968
 l3, normalised_english_counts, scale, 5, 0.4114
-l3, scaled_english_counts, normalise, 3000, 0.9616
-l3, scaled_english_counts, normalise, 1000, 0.9612
-l3, scaled_english_counts, normalise, 300, 0.9624
-l3, scaled_english_counts, normalise, 100, 0.9524
-l3, scaled_english_counts, normalise, 50, 0.9474
-l3, scaled_english_counts, normalise, 30, 0.9066
-l3, scaled_english_counts, normalise, 20, 0.8004
-l3, scaled_english_counts, normalise, 10, 0.5686
-l3, scaled_english_counts, normalise, 5, 0.3404
-l3, scaled_english_counts, scale, 3000, 0.96
-l3, scaled_english_counts, scale, 1000, 0.96
-l3, scaled_english_counts, scale, 300, 0.9596
-l3, scaled_english_counts, scale, 100, 0.96
-l3, scaled_english_counts, scale, 50, 0.954
-l3, scaled_english_counts, scale, 30, 0.9374
-l3, scaled_english_counts, scale, 20, 0.862
-l3, scaled_english_counts, scale, 10, 0.6276
-l3, scaled_english_counts, scale, 5, 0.399
-cosine_distance, normalised_english_counts, normalise, 3000, 0.9618
-cosine_distance, normalised_english_counts, normalise, 1000, 0.96
-cosine_distance, normalised_english_counts, normalise, 300, 0.9604
-cosine_distance, normalised_english_counts, normalise, 100, 0.9538
-cosine_distance, normalised_english_counts, normalise, 50, 0.9608
-cosine_distance, normalised_english_counts, normalise, 30, 0.9426
-cosine_distance, normalised_english_counts, normalise, 20, 0.9012
-cosine_distance, normalised_english_counts, normalise, 10, 0.6916
-cosine_distance, normalised_english_counts, normalise, 5, 0.4286
-cosine_distance, normalised_english_counts, scale, 3000, 0.9606
-cosine_distance, normalised_english_counts, scale, 1000, 0.9572
-cosine_distance, normalised_english_counts, scale, 300, 0.9628
-cosine_distance, normalised_english_counts, scale, 100, 0.959
-cosine_distance, normalised_english_counts, scale, 50, 0.9542
-cosine_distance, normalised_english_counts, scale, 30, 0.951
-cosine_distance, normalised_english_counts, scale, 20, 0.9028
-cosine_distance, normalised_english_counts, scale, 10, 0.7028
-cosine_distance, normalised_english_counts, scale, 5, 0.44
-cosine_distance, scaled_english_counts, normalise, 3000, 0.9582
-cosine_distance, scaled_english_counts, normalise, 1000, 0.9614
-cosine_distance, scaled_english_counts, normalise, 300, 0.9632
-cosine_distance, scaled_english_counts, normalise, 100, 0.9584
-cosine_distance, scaled_english_counts, normalise, 50, 0.9574
-cosine_distance, scaled_english_counts, normalise, 30, 0.9506
-cosine_distance, scaled_english_counts, normalise, 20, 0.8956
-cosine_distance, scaled_english_counts, normalise, 10, 0.6916
-cosine_distance, scaled_english_counts, normalise, 5, 0.4356
-cosine_distance, scaled_english_counts, scale, 3000, 0.9572
-cosine_distance, scaled_english_counts, scale, 1000, 0.961
-cosine_distance, scaled_english_counts, scale, 300, 0.9596
-cosine_distance, scaled_english_counts, scale, 100, 0.9544
-cosine_distance, scaled_english_counts, scale, 50, 0.9598
-cosine_distance, scaled_english_counts, scale, 30, 0.9414
-cosine_distance, scaled_english_counts, scale, 20, 0.9036
-cosine_distance, scaled_english_counts, scale, 10, 0.6928
-cosine_distance, scaled_english_counts, scale, 5, 0.4178
+l3, scaled_english_counts, normalise, 300, 0.9994
+l3, scaled_english_counts, normalise, 100, 0.9984
+l3, scaled_english_counts, normalise, 50, 0.9876
+l3, scaled_english_counts, normalise, 30, 0.9284
+l3, scaled_english_counts, normalise, 20, 0.8322
+l3, scaled_english_counts, normalise, 10, 0.579
+l3, scaled_english_counts, normalise, 5, 0.3466
+l3, scaled_english_counts, scale, 300, 1.0
+l3, scaled_english_counts, scale, 100, 0.999
+l3, scaled_english_counts, scale, 50, 0.994
+l3, scaled_english_counts, scale, 30, 0.9688
+l3, scaled_english_counts, scale, 20, 0.8952
+l3, scaled_english_counts, scale, 10, 0.6416
+l3, scaled_english_counts, scale, 5, 0.4042
+cosine_distance, normalised_english_counts, normalise, 300, 0.9994
+cosine_distance, normalised_english_counts, normalise, 100, 1.0
+cosine_distance, normalised_english_counts, normalise, 50, 0.9978
+cosine_distance, normalised_english_counts, normalise, 30, 0.9856
+cosine_distance, normalised_english_counts, normalise, 20, 0.9374
+cosine_distance, normalised_english_counts, normalise, 10, 0.7212
+cosine_distance, normalised_english_counts, normalise, 5, 0.4282
+cosine_distance, normalised_english_counts, scale, 300, 0.9998
+cosine_distance, normalised_english_counts, scale, 100, 0.9994
+cosine_distance, normalised_english_counts, scale, 50, 0.9972
+cosine_distance, normalised_english_counts, scale, 30, 0.9846
+cosine_distance, normalised_english_counts, scale, 20, 0.9324
+cosine_distance, normalised_english_counts, scale, 10, 0.7144
+cosine_distance, normalised_english_counts, scale, 5, 0.4284
+cosine_distance, scaled_english_counts, normalise, 300, 0.9994
+cosine_distance, scaled_english_counts, normalise, 100, 0.9996
+cosine_distance, scaled_english_counts, normalise, 50, 0.9978
+cosine_distance, scaled_english_counts, normalise, 30, 0.9856
+cosine_distance, scaled_english_counts, normalise, 20, 0.935
+cosine_distance, scaled_english_counts, normalise, 10, 0.7232
+cosine_distance, scaled_english_counts, normalise, 5, 0.415
+cosine_distance, scaled_english_counts, scale, 300, 0.9982
+cosine_distance, scaled_english_counts, scale, 100, 0.9988
+cosine_distance, scaled_english_counts, scale, 50, 0.9976
+cosine_distance, scaled_english_counts, scale, 30, 0.9844
+cosine_distance, scaled_english_counts, scale, 20, 0.9314
+cosine_distance, scaled_english_counts, scale, 10, 0.7102
+cosine_distance, scaled_english_counts, scale, 5, 0.4376
+harmonic_mean, normalised_english_counts, normalise, 300, 0.4684
+harmonic_mean, normalised_english_counts, normalise, 100, 0.5068
+harmonic_mean, normalised_english_counts, normalise, 50, 0.6978
+harmonic_mean, normalised_english_counts, normalise, 30, 0.593
+harmonic_mean, normalised_english_counts, normalise, 20, 0.536
+harmonic_mean, normalised_english_counts, normalise, 10, 0.4284
+harmonic_mean, normalised_english_counts, normalise, 5, 0.3542
+harmonic_mean, normalised_english_counts, scale, 300, 0.3602
+harmonic_mean, normalised_english_counts, scale, 100, 0.57
+harmonic_mean, normalised_english_counts, scale, 50, 0.795
+harmonic_mean, normalised_english_counts, scale, 30, 0.7694
+harmonic_mean, normalised_english_counts, scale, 20, 0.6924
+harmonic_mean, normalised_english_counts, scale, 10, 0.559
+harmonic_mean, normalised_english_counts, scale, 5, 0.39
+harmonic_mean, scaled_english_counts, normalise, 300, 0.1214
+harmonic_mean, scaled_english_counts, normalise, 100, 0.132
+harmonic_mean, scaled_english_counts, normalise, 50, 0.1956
+harmonic_mean, scaled_english_counts, normalise, 30, 0.2686
+harmonic_mean, scaled_english_counts, normalise, 20, 0.258
+harmonic_mean, scaled_english_counts, normalise, 10, 0.2042
+harmonic_mean, scaled_english_counts, normalise, 5, 0.227
+harmonic_mean, scaled_english_counts, scale, 300, 0.7956
+harmonic_mean, scaled_english_counts, scale, 100, 0.5672
+harmonic_mean, scaled_english_counts, scale, 50, 0.4404
+harmonic_mean, scaled_english_counts, scale, 30, 0.3584
+harmonic_mean, scaled_english_counts, scale, 20, 0.3012
+harmonic_mean, scaled_english_counts, scale, 10, 0.2136
+harmonic_mean, scaled_english_counts, scale, 5, 0.1426
+geometric_mean, normalised_english_counts, normalise, 300, 0.9996
+geometric_mean, normalised_english_counts, normalise, 100, 0.9992
+geometric_mean, normalised_english_counts, normalise, 50, 0.9928
+geometric_mean, normalised_english_counts, normalise, 30, 0.9552
+geometric_mean, normalised_english_counts, normalise, 20, 0.8936
+geometric_mean, normalised_english_counts, normalise, 10, 0.6582
+geometric_mean, normalised_english_counts, normalise, 5, 0.4316
+geometric_mean, normalised_english_counts, scale, 300, 0.97
+geometric_mean, normalised_english_counts, scale, 100, 0.9762
+geometric_mean, normalised_english_counts, scale, 50, 0.9724
+geometric_mean, normalised_english_counts, scale, 30, 0.9224
+geometric_mean, normalised_english_counts, scale, 20, 0.8496
+geometric_mean, normalised_english_counts, scale, 10, 0.6846
+geometric_mean, normalised_english_counts, scale, 5, 0.4268
+geometric_mean, scaled_english_counts, normalise, 300, 0.9556
+geometric_mean, scaled_english_counts, normalise, 100, 0.8724
+geometric_mean, scaled_english_counts, normalise, 50, 0.7176
+geometric_mean, scaled_english_counts, normalise, 30, 0.6536
+geometric_mean, scaled_english_counts, normalise, 20, 0.5586
+geometric_mean, scaled_english_counts, normalise, 10, 0.3926
+geometric_mean, scaled_english_counts, normalise, 5, 0.319
+geometric_mean, scaled_english_counts, scale, 300, 0.7822
+geometric_mean, scaled_english_counts, scale, 100, 0.5784
+geometric_mean, scaled_english_counts, scale, 50, 0.4318
+geometric_mean, scaled_english_counts, scale, 30, 0.349
+geometric_mean, scaled_english_counts, scale, 20, 0.2932
+geometric_mean, scaled_english_counts, scale, 10, 0.2098
+geometric_mean, scaled_english_counts, scale, 5, 0.1406
index 053635008df52ccdaba6b354dd5c4ddd11462ba5..b883abea3cbc32bf81bb37a342efeaf576ffbf02 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -21,6 +21,9 @@ def sanitise(text):
     sanitised = [c.lower() for c in text if c in string.ascii_letters]
     return ''.join(sanitised)
 
+def ngrams(text, n):
+    return [tuple(text[i:i+n]) for i in range(len(text)-n+1)]
+
 def letter_frequencies(text):
     """Count the number of occurrences of each character in text
     
@@ -105,10 +108,22 @@ def caesar_decipher(message, shift):
     return caesar_encipher(message, -shift)
 
 def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise):
+    """Breaks a Caesar cipher using frequency analysis
+    
+    
+    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm')
+    (4, 0.3186395289018361)
+    >>> caesar_break('jhzhuhfrqilqhgwrdevwudfwuhdvrqlqjwkhqkdylqjvxemhfwhgwrfulwlflvpwkhhasodqdwlrqrisrzhuwkdwmxulglfdovfl')
+    (3, 0.32902042861730835)
+    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert')
+    (19, 0.4215290123583277)
+    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur')
+    (13, 0.31602920807545154)
+    """
     sanitised_message = sanitise(message)
     best_shift = 0
     best_fit = float("inf")
-    for shift in range(1, 25):
+    for shift in range(26):
         plaintext = caesar_decipher(sanitised_message, shift)
         frequencies = message_frequency_scaling(letter_frequencies(plaintext))
         fit = metric(target_frequencies, frequencies)
index 711cff0f5a3fbe7a10bcdc413da212b564700578..ed8bbaac3f278f122b97612875ff3d614bc5c442 100644 (file)
@@ -11,7 +11,7 @@ scaled_english_counts = norms.scale(english_counts)
 metrics = [norms.l1, norms.l2, norms.l3, norms.cosine_distance, norms.harmonic_mean, norms.geometric_mean]
 corpus_frequencies = [normalised_english_counts, scaled_english_counts]
 scalings = [norms.normalise, norms.scale]
-message_lengths = [3000, 1000, 300, 100, 50, 30, 20, 10, 5]
+message_lengths = [300, 100, 50, 30, 20, 10, 5]
 
 metric_names = ['l1', 'l2', 'l3', 'cosine_distance', 'harmonic_mean', 'geometric_mean']
 corpus_frequency_names = ['normalised_english_counts', 'scaled_english_counts']
index 4fdf1e3d85bb347c501bcb88c6caec7a8c969035..08cff74b82541f2e2331f2ce85775db68ea44399 100644 (file)
--- a/norms.py
+++ b/norms.py
@@ -97,24 +97,52 @@ def l3(frequencies1, frequencies2):
     return total ** (1/3)
 
 def geometric_mean(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
+    """Finds the geometric mean of the absolute differences between two frequency profiles, 
+    expressed as dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
-
+    
+    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1
+    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1
+    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
+    3
+    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))
+    0.057022248808851934
+    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0.0
+    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))
+    0.009720703533656434
     """
-    total = 0
+    total = 1
     for k in frequencies1.keys():
         total *= abs(frequencies1[k] - frequencies2[k])
     return total
 
 def harmonic_mean(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
+    """Finds the harmonic mean of the absolute differences between two frequency profiles, 
+    expressed as dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
 
+    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.0
+    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
+    1.0
+    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
+    1.2857142857142858
+    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))
+    0.3849001794597505
+    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
+    0
+    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))
+    0.17497266360581604
     """
     total = 0
     for k in frequencies1.keys():
+        if abs(frequencies1[k] - frequencies2[k]) == 0:
+            return 0
         total += 1 / abs(frequencies1[k] - frequencies2[k])
-    return 1 / total
+    return len(frequencies1) / total
 
 
 def cosine_distance(frequencies1, frequencies2):