From: Neil Smith Date: Mon, 7 Oct 2013 12:28:24 +0000 (+0100) Subject: Fixed bugs in geometric and harmonic means, added some tests. X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=ac47ee478ba8e785037c9f25367c111f630eea54;p=cipher-tools.git Fixed bugs in geometric and harmonic means, added some tests. --- diff --git a/__pycache__/cipher.cpython-33.pyc b/__pycache__/cipher.cpython-33.pyc index 05222b0..71c263e 100644 Binary files a/__pycache__/cipher.cpython-33.pyc and b/__pycache__/cipher.cpython-33.pyc differ diff --git a/__pycache__/norms.cpython-33.pyc b/__pycache__/norms.cpython-33.pyc index b18ed4b..7d10c2f 100644 Binary files a/__pycache__/norms.cpython-33.pyc and b/__pycache__/norms.cpython-33.pyc differ diff --git a/caesar_break_parameter_trials.csv b/caesar_break_parameter_trials.csv index df9b836..ba7ee27 100644 --- a/caesar_break_parameter_trials.csv +++ b/caesar_break_parameter_trials.csv @@ -1,144 +1,168 @@ -l1, normalised_english_counts, normalise, 3000, 0.9616 -l1, normalised_english_counts, normalise, 1000, 0.9562 -l1, normalised_english_counts, normalise, 300, 0.9598 -l1, normalised_english_counts, normalise, 100, 0.9622 -l1, normalised_english_counts, normalise, 50, 0.9584 -l1, normalised_english_counts, normalise, 30, 0.953 -l1, normalised_english_counts, normalise, 20, 0.917 -l1, normalised_english_counts, normalise, 10, 0.7328 -l1, normalised_english_counts, normalise, 5, 0.4394 -l1, normalised_english_counts, scale, 3000, 0.9618 -l1, normalised_english_counts, scale, 1000, 0.9574 -l1, normalised_english_counts, scale, 300, 0.9624 -l1, normalised_english_counts, scale, 100, 0.9566 -l1, normalised_english_counts, scale, 50, 0.959 -l1, normalised_english_counts, scale, 30, 0.9476 -l1, normalised_english_counts, scale, 20, 0.8968 -l1, normalised_english_counts, scale, 10, 0.6844 -l1, normalised_english_counts, scale, 5, 0.4298 -l1, scaled_english_counts, normalise, 3000, 0.957 -l1, scaled_english_counts, normalise, 1000, 0.9662 -l1, scaled_english_counts, normalise, 300, 0.9604 -l1, scaled_english_counts, normalise, 100, 0.9602 -l1, scaled_english_counts, normalise, 50, 0.9578 -l1, scaled_english_counts, normalise, 30, 0.9504 -l1, scaled_english_counts, normalise, 20, 0.9174 -l1, scaled_english_counts, normalise, 10, 0.7204 -l1, scaled_english_counts, normalise, 5, 0.4506 -l1, scaled_english_counts, scale, 3000, 0.9584 -l1, scaled_english_counts, scale, 1000, 0.9586 -l1, scaled_english_counts, scale, 300, 0.964 -l1, scaled_english_counts, scale, 100, 0.9582 -l1, scaled_english_counts, scale, 50, 0.9606 -l1, scaled_english_counts, scale, 30, 0.944 -l1, scaled_english_counts, scale, 20, 0.915 -l1, scaled_english_counts, scale, 10, 0.7324 -l1, scaled_english_counts, scale, 5, 0.4446 -l2, normalised_english_counts, normalise, 3000, 0.953 -l2, normalised_english_counts, normalise, 1000, 0.962 -l2, normalised_english_counts, normalise, 300, 0.9638 -l2, normalised_english_counts, normalise, 100, 0.9632 -l2, normalised_english_counts, normalise, 50, 0.9604 -l2, normalised_english_counts, normalise, 30, 0.95 -l2, normalised_english_counts, normalise, 20, 0.892 -l2, normalised_english_counts, normalise, 10, 0.7124 -l2, normalised_english_counts, normalise, 5, 0.4406 -l2, normalised_english_counts, scale, 3000, 0.9626 -l2, normalised_english_counts, scale, 1000, 0.956 -l2, normalised_english_counts, scale, 300, 0.962 -l2, normalised_english_counts, scale, 100, 0.9572 -l2, normalised_english_counts, scale, 50, 0.9526 -l2, normalised_english_counts, scale, 30, 0.9478 -l2, normalised_english_counts, scale, 20, 0.9046 -l2, normalised_english_counts, scale, 10, 0.6896 -l2, normalised_english_counts, scale, 5, 0.4308 -l2, scaled_english_counts, normalise, 3000, 0.9574 -l2, scaled_english_counts, normalise, 1000, 0.9568 -l2, scaled_english_counts, normalise, 300, 0.9536 -l2, scaled_english_counts, normalise, 100, 0.9624 -l2, scaled_english_counts, normalise, 50, 0.9606 -l2, scaled_english_counts, normalise, 30, 0.9384 -l2, scaled_english_counts, normalise, 20, 0.8914 -l2, scaled_english_counts, normalise, 10, 0.6892 -l2, scaled_english_counts, normalise, 5, 0.4196 -l2, scaled_english_counts, scale, 3000, 0.9532 -l2, scaled_english_counts, scale, 1000, 0.9588 -l2, scaled_english_counts, scale, 300, 0.9644 -l2, scaled_english_counts, scale, 100, 0.9572 -l2, scaled_english_counts, scale, 50, 0.9586 -l2, scaled_english_counts, scale, 30, 0.9436 -l2, scaled_english_counts, scale, 20, 0.9036 -l2, scaled_english_counts, scale, 10, 0.693 -l2, scaled_english_counts, scale, 5, 0.4376 -l3, normalised_english_counts, normalise, 3000, 0.9626 -l3, normalised_english_counts, normalise, 1000, 0.9582 -l3, normalised_english_counts, normalise, 300, 0.9542 -l3, normalised_english_counts, normalise, 100, 0.9606 -l3, normalised_english_counts, normalise, 50, 0.953 -l3, normalised_english_counts, normalise, 30, 0.9248 -l3, normalised_english_counts, normalise, 20, 0.8684 -l3, normalised_english_counts, normalise, 10, 0.6106 -l3, normalised_english_counts, normalise, 5, 0.4064 -l3, normalised_english_counts, scale, 3000, 0.961 -l3, normalised_english_counts, scale, 1000, 0.9568 -l3, normalised_english_counts, scale, 300, 0.9566 -l3, normalised_english_counts, scale, 100, 0.9554 -l3, normalised_english_counts, scale, 50, 0.9436 -l3, normalised_english_counts, scale, 30, 0.8936 -l3, normalised_english_counts, scale, 20, 0.8016 -l3, normalised_english_counts, scale, 10, 0.579 +l1, normalised_english_counts, normalise, 300, 0.9992 +l1, normalised_english_counts, normalise, 100, 0.9996 +l1, normalised_english_counts, normalise, 50, 0.9992 +l1, normalised_english_counts, normalise, 30, 0.9914 +l1, normalised_english_counts, normalise, 20, 0.9532 +l1, normalised_english_counts, normalise, 10, 0.7442 +l1, normalised_english_counts, normalise, 5, 0.4358 +l1, normalised_english_counts, scale, 300, 1.0 +l1, normalised_english_counts, scale, 100, 0.999 +l1, normalised_english_counts, scale, 50, 0.9988 +l1, normalised_english_counts, scale, 30, 0.9848 +l1, normalised_english_counts, scale, 20, 0.9316 +l1, normalised_english_counts, scale, 10, 0.715 +l1, normalised_english_counts, scale, 5, 0.436 +l1, scaled_english_counts, normalise, 300, 0.9994 +l1, scaled_english_counts, normalise, 100, 0.9998 +l1, scaled_english_counts, normalise, 50, 0.999 +l1, scaled_english_counts, normalise, 30, 0.9868 +l1, scaled_english_counts, normalise, 20, 0.9482 +l1, scaled_english_counts, normalise, 10, 0.7434 +l1, scaled_english_counts, normalise, 5, 0.4532 +l1, scaled_english_counts, scale, 300, 0.9996 +l1, scaled_english_counts, scale, 100, 1.0 +l1, scaled_english_counts, scale, 50, 0.9988 +l1, scaled_english_counts, scale, 30, 0.9874 +l1, scaled_english_counts, scale, 20, 0.9488 +l1, scaled_english_counts, scale, 10, 0.745 +l1, scaled_english_counts, scale, 5, 0.4548 +l2, normalised_english_counts, normalise, 300, 0.9994 +l2, normalised_english_counts, normalise, 100, 0.9992 +l2, normalised_english_counts, normalise, 50, 0.9978 +l2, normalised_english_counts, normalise, 30, 0.9836 +l2, normalised_english_counts, normalise, 20, 0.9318 +l2, normalised_english_counts, normalise, 10, 0.7072 +l2, normalised_english_counts, normalise, 5, 0.4294 +l2, normalised_english_counts, scale, 300, 0.9988 +l2, normalised_english_counts, scale, 100, 0.9998 +l2, normalised_english_counts, scale, 50, 0.9978 +l2, normalised_english_counts, scale, 30, 0.9868 +l2, normalised_english_counts, scale, 20, 0.9364 +l2, normalised_english_counts, scale, 10, 0.7136 +l2, normalised_english_counts, scale, 5, 0.446 +l2, scaled_english_counts, normalise, 300, 0.9992 +l2, scaled_english_counts, normalise, 100, 0.9996 +l2, scaled_english_counts, normalise, 50, 0.9984 +l2, scaled_english_counts, normalise, 30, 0.9854 +l2, scaled_english_counts, normalise, 20, 0.9328 +l2, scaled_english_counts, normalise, 10, 0.7122 +l2, scaled_english_counts, normalise, 5, 0.4328 +l2, scaled_english_counts, scale, 300, 1.0 +l2, scaled_english_counts, scale, 100, 0.9998 +l2, scaled_english_counts, scale, 50, 0.9972 +l2, scaled_english_counts, scale, 30, 0.9842 +l2, scaled_english_counts, scale, 20, 0.9356 +l2, scaled_english_counts, scale, 10, 0.7126 +l2, scaled_english_counts, scale, 5, 0.4318 +l3, normalised_english_counts, normalise, 300, 0.9996 +l3, normalised_english_counts, normalise, 100, 0.999 +l3, normalised_english_counts, normalise, 50, 0.994 +l3, normalised_english_counts, normalise, 30, 0.9658 +l3, normalised_english_counts, normalise, 20, 0.8926 +l3, normalised_english_counts, normalise, 10, 0.6252 +l3, normalised_english_counts, normalise, 5, 0.3974 +l3, normalised_english_counts, scale, 300, 0.9996 +l3, normalised_english_counts, scale, 100, 0.998 +l3, normalised_english_counts, scale, 50, 0.9828 +l3, normalised_english_counts, scale, 30, 0.9334 +l3, normalised_english_counts, scale, 20, 0.8304 +l3, normalised_english_counts, scale, 10, 0.5968 l3, normalised_english_counts, scale, 5, 0.4114 -l3, scaled_english_counts, normalise, 3000, 0.9616 -l3, scaled_english_counts, normalise, 1000, 0.9612 -l3, scaled_english_counts, normalise, 300, 0.9624 -l3, scaled_english_counts, normalise, 100, 0.9524 -l3, scaled_english_counts, normalise, 50, 0.9474 -l3, scaled_english_counts, normalise, 30, 0.9066 -l3, scaled_english_counts, normalise, 20, 0.8004 -l3, scaled_english_counts, normalise, 10, 0.5686 -l3, scaled_english_counts, normalise, 5, 0.3404 -l3, scaled_english_counts, scale, 3000, 0.96 -l3, scaled_english_counts, scale, 1000, 0.96 -l3, scaled_english_counts, scale, 300, 0.9596 -l3, scaled_english_counts, scale, 100, 0.96 -l3, scaled_english_counts, scale, 50, 0.954 -l3, scaled_english_counts, scale, 30, 0.9374 -l3, scaled_english_counts, scale, 20, 0.862 -l3, scaled_english_counts, scale, 10, 0.6276 -l3, scaled_english_counts, scale, 5, 0.399 -cosine_distance, normalised_english_counts, normalise, 3000, 0.9618 -cosine_distance, normalised_english_counts, normalise, 1000, 0.96 -cosine_distance, normalised_english_counts, normalise, 300, 0.9604 -cosine_distance, normalised_english_counts, normalise, 100, 0.9538 -cosine_distance, normalised_english_counts, normalise, 50, 0.9608 -cosine_distance, normalised_english_counts, normalise, 30, 0.9426 -cosine_distance, normalised_english_counts, normalise, 20, 0.9012 -cosine_distance, normalised_english_counts, normalise, 10, 0.6916 -cosine_distance, normalised_english_counts, normalise, 5, 0.4286 -cosine_distance, normalised_english_counts, scale, 3000, 0.9606 -cosine_distance, normalised_english_counts, scale, 1000, 0.9572 -cosine_distance, normalised_english_counts, scale, 300, 0.9628 -cosine_distance, normalised_english_counts, scale, 100, 0.959 -cosine_distance, normalised_english_counts, scale, 50, 0.9542 -cosine_distance, normalised_english_counts, scale, 30, 0.951 -cosine_distance, normalised_english_counts, scale, 20, 0.9028 -cosine_distance, normalised_english_counts, scale, 10, 0.7028 -cosine_distance, normalised_english_counts, scale, 5, 0.44 -cosine_distance, scaled_english_counts, normalise, 3000, 0.9582 -cosine_distance, scaled_english_counts, normalise, 1000, 0.9614 -cosine_distance, scaled_english_counts, normalise, 300, 0.9632 -cosine_distance, scaled_english_counts, normalise, 100, 0.9584 -cosine_distance, scaled_english_counts, normalise, 50, 0.9574 -cosine_distance, scaled_english_counts, normalise, 30, 0.9506 -cosine_distance, scaled_english_counts, normalise, 20, 0.8956 -cosine_distance, scaled_english_counts, normalise, 10, 0.6916 -cosine_distance, scaled_english_counts, normalise, 5, 0.4356 -cosine_distance, scaled_english_counts, scale, 3000, 0.9572 -cosine_distance, scaled_english_counts, scale, 1000, 0.961 -cosine_distance, scaled_english_counts, scale, 300, 0.9596 -cosine_distance, scaled_english_counts, scale, 100, 0.9544 -cosine_distance, scaled_english_counts, scale, 50, 0.9598 -cosine_distance, scaled_english_counts, scale, 30, 0.9414 -cosine_distance, scaled_english_counts, scale, 20, 0.9036 -cosine_distance, scaled_english_counts, scale, 10, 0.6928 -cosine_distance, scaled_english_counts, scale, 5, 0.4178 +l3, scaled_english_counts, normalise, 300, 0.9994 +l3, scaled_english_counts, normalise, 100, 0.9984 +l3, scaled_english_counts, normalise, 50, 0.9876 +l3, scaled_english_counts, normalise, 30, 0.9284 +l3, scaled_english_counts, normalise, 20, 0.8322 +l3, scaled_english_counts, normalise, 10, 0.579 +l3, scaled_english_counts, normalise, 5, 0.3466 +l3, scaled_english_counts, scale, 300, 1.0 +l3, scaled_english_counts, scale, 100, 0.999 +l3, scaled_english_counts, scale, 50, 0.994 +l3, scaled_english_counts, scale, 30, 0.9688 +l3, scaled_english_counts, scale, 20, 0.8952 +l3, scaled_english_counts, scale, 10, 0.6416 +l3, scaled_english_counts, scale, 5, 0.4042 +cosine_distance, normalised_english_counts, normalise, 300, 0.9994 +cosine_distance, normalised_english_counts, normalise, 100, 1.0 +cosine_distance, normalised_english_counts, normalise, 50, 0.9978 +cosine_distance, normalised_english_counts, normalise, 30, 0.9856 +cosine_distance, normalised_english_counts, normalise, 20, 0.9374 +cosine_distance, normalised_english_counts, normalise, 10, 0.7212 +cosine_distance, normalised_english_counts, normalise, 5, 0.4282 +cosine_distance, normalised_english_counts, scale, 300, 0.9998 +cosine_distance, normalised_english_counts, scale, 100, 0.9994 +cosine_distance, normalised_english_counts, scale, 50, 0.9972 +cosine_distance, normalised_english_counts, scale, 30, 0.9846 +cosine_distance, normalised_english_counts, scale, 20, 0.9324 +cosine_distance, normalised_english_counts, scale, 10, 0.7144 +cosine_distance, normalised_english_counts, scale, 5, 0.4284 +cosine_distance, scaled_english_counts, normalise, 300, 0.9994 +cosine_distance, scaled_english_counts, normalise, 100, 0.9996 +cosine_distance, scaled_english_counts, normalise, 50, 0.9978 +cosine_distance, scaled_english_counts, normalise, 30, 0.9856 +cosine_distance, scaled_english_counts, normalise, 20, 0.935 +cosine_distance, scaled_english_counts, normalise, 10, 0.7232 +cosine_distance, scaled_english_counts, normalise, 5, 0.415 +cosine_distance, scaled_english_counts, scale, 300, 0.9982 +cosine_distance, scaled_english_counts, scale, 100, 0.9988 +cosine_distance, scaled_english_counts, scale, 50, 0.9976 +cosine_distance, scaled_english_counts, scale, 30, 0.9844 +cosine_distance, scaled_english_counts, scale, 20, 0.9314 +cosine_distance, scaled_english_counts, scale, 10, 0.7102 +cosine_distance, scaled_english_counts, scale, 5, 0.4376 +harmonic_mean, normalised_english_counts, normalise, 300, 0.4684 +harmonic_mean, normalised_english_counts, normalise, 100, 0.5068 +harmonic_mean, normalised_english_counts, normalise, 50, 0.6978 +harmonic_mean, normalised_english_counts, normalise, 30, 0.593 +harmonic_mean, normalised_english_counts, normalise, 20, 0.536 +harmonic_mean, normalised_english_counts, normalise, 10, 0.4284 +harmonic_mean, normalised_english_counts, normalise, 5, 0.3542 +harmonic_mean, normalised_english_counts, scale, 300, 0.3602 +harmonic_mean, normalised_english_counts, scale, 100, 0.57 +harmonic_mean, normalised_english_counts, scale, 50, 0.795 +harmonic_mean, normalised_english_counts, scale, 30, 0.7694 +harmonic_mean, normalised_english_counts, scale, 20, 0.6924 +harmonic_mean, normalised_english_counts, scale, 10, 0.559 +harmonic_mean, normalised_english_counts, scale, 5, 0.39 +harmonic_mean, scaled_english_counts, normalise, 300, 0.1214 +harmonic_mean, scaled_english_counts, normalise, 100, 0.132 +harmonic_mean, scaled_english_counts, normalise, 50, 0.1956 +harmonic_mean, scaled_english_counts, normalise, 30, 0.2686 +harmonic_mean, scaled_english_counts, normalise, 20, 0.258 +harmonic_mean, scaled_english_counts, normalise, 10, 0.2042 +harmonic_mean, scaled_english_counts, normalise, 5, 0.227 +harmonic_mean, scaled_english_counts, scale, 300, 0.7956 +harmonic_mean, scaled_english_counts, scale, 100, 0.5672 +harmonic_mean, scaled_english_counts, scale, 50, 0.4404 +harmonic_mean, scaled_english_counts, scale, 30, 0.3584 +harmonic_mean, scaled_english_counts, scale, 20, 0.3012 +harmonic_mean, scaled_english_counts, scale, 10, 0.2136 +harmonic_mean, scaled_english_counts, scale, 5, 0.1426 +geometric_mean, normalised_english_counts, normalise, 300, 0.9996 +geometric_mean, normalised_english_counts, normalise, 100, 0.9992 +geometric_mean, normalised_english_counts, normalise, 50, 0.9928 +geometric_mean, normalised_english_counts, normalise, 30, 0.9552 +geometric_mean, normalised_english_counts, normalise, 20, 0.8936 +geometric_mean, normalised_english_counts, normalise, 10, 0.6582 +geometric_mean, normalised_english_counts, normalise, 5, 0.4316 +geometric_mean, normalised_english_counts, scale, 300, 0.97 +geometric_mean, normalised_english_counts, scale, 100, 0.9762 +geometric_mean, normalised_english_counts, scale, 50, 0.9724 +geometric_mean, normalised_english_counts, scale, 30, 0.9224 +geometric_mean, normalised_english_counts, scale, 20, 0.8496 +geometric_mean, normalised_english_counts, scale, 10, 0.6846 +geometric_mean, normalised_english_counts, scale, 5, 0.4268 +geometric_mean, scaled_english_counts, normalise, 300, 0.9556 +geometric_mean, scaled_english_counts, normalise, 100, 0.8724 +geometric_mean, scaled_english_counts, normalise, 50, 0.7176 +geometric_mean, scaled_english_counts, normalise, 30, 0.6536 +geometric_mean, scaled_english_counts, normalise, 20, 0.5586 +geometric_mean, scaled_english_counts, normalise, 10, 0.3926 +geometric_mean, scaled_english_counts, normalise, 5, 0.319 +geometric_mean, scaled_english_counts, scale, 300, 0.7822 +geometric_mean, scaled_english_counts, scale, 100, 0.5784 +geometric_mean, scaled_english_counts, scale, 50, 0.4318 +geometric_mean, scaled_english_counts, scale, 30, 0.349 +geometric_mean, scaled_english_counts, scale, 20, 0.2932 +geometric_mean, scaled_english_counts, scale, 10, 0.2098 +geometric_mean, scaled_english_counts, scale, 5, 0.1406 diff --git a/cipher.py b/cipher.py index 0536350..b883abe 100644 --- a/cipher.py +++ b/cipher.py @@ -21,6 +21,9 @@ def sanitise(text): sanitised = [c.lower() for c in text if c in string.ascii_letters] return ''.join(sanitised) +def ngrams(text, n): + return [tuple(text[i:i+n]) for i in range(len(text)-n+1)] + def letter_frequencies(text): """Count the number of occurrences of each character in text @@ -105,10 +108,22 @@ def caesar_decipher(message, shift): return caesar_encipher(message, -shift) def caesar_break(message, metric=norms.euclidean_distance, target_frequencies=normalised_english_counts, message_frequency_scaling=norms.normalise): + """Breaks a Caesar cipher using frequency analysis + + + >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrhecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') + (4, 0.3186395289018361) + >>> caesar_break('jhzhuhfrqilqhgwrdevwudfwuhdvrqlqjwkhqkdylqjvxemhfwhgwrfulwlflvpwkhhasodqdwlrqrisrzhuwkdwmxulglfdovfl') + (3, 0.32902042861730835) + >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgteeraxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') + (19, 0.4215290123583277) + >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurersvaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') + (13, 0.31602920807545154) + """ sanitised_message = sanitise(message) best_shift = 0 best_fit = float("inf") - for shift in range(1, 25): + for shift in range(26): plaintext = caesar_decipher(sanitised_message, shift) frequencies = message_frequency_scaling(letter_frequencies(plaintext)) fit = metric(target_frequencies, frequencies) diff --git a/find_best_caesar_break_parameters.py b/find_best_caesar_break_parameters.py index 711cff0..ed8bbaa 100644 --- a/find_best_caesar_break_parameters.py +++ b/find_best_caesar_break_parameters.py @@ -11,7 +11,7 @@ scaled_english_counts = norms.scale(english_counts) metrics = [norms.l1, norms.l2, norms.l3, norms.cosine_distance, norms.harmonic_mean, norms.geometric_mean] corpus_frequencies = [normalised_english_counts, scaled_english_counts] scalings = [norms.normalise, norms.scale] -message_lengths = [3000, 1000, 300, 100, 50, 30, 20, 10, 5] +message_lengths = [300, 100, 50, 30, 20, 10, 5] metric_names = ['l1', 'l2', 'l3', 'cosine_distance', 'harmonic_mean', 'geometric_mean'] corpus_frequency_names = ['normalised_english_counts', 'scaled_english_counts'] diff --git a/norms.py b/norms.py index 4fdf1e3..08cff74 100644 --- a/norms.py +++ b/norms.py @@ -97,24 +97,52 @@ def l3(frequencies1, frequencies2): return total ** (1/3) def geometric_mean(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as dictionaries. + """Finds the geometric mean of the absolute differences between two frequency profiles, + expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 - + + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1 + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1 + >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) + 3 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1})) + 0.057022248808851934 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1})) + 0.0 + >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0})) + 0.009720703533656434 """ - total = 0 + total = 1 for k in frequencies1.keys(): total *= abs(frequencies1[k] - frequencies2[k]) return total def harmonic_mean(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as dictionaries. + """Finds the harmonic mean of the absolute differences between two frequency profiles, + expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1.0 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) + 1.0 + >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) + 1.2857142857142858 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1})) + 0.3849001794597505 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1})) + 0 + >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0})) + 0.17497266360581604 """ total = 0 for k in frequencies1.keys(): + if abs(frequencies1[k] - frequencies2[k]) == 0: + return 0 total += 1 / abs(frequencies1[k] - frequencies2[k]) - return 1 / total + return len(frequencies1) / total def cosine_distance(frequencies1, frequencies2):