def normalise(frequencies):
    """Scale a set of frequencies so they have a unit euclidean length.

    Every value is divided by the square root of the sum of the squared
    values. The input mapping is not modified; the result is a new
    ``collections.defaultdict(int)``, so keys that are absent from
    ``frequencies`` read as 0.

    Raises ZeroDivisionError if ``frequencies`` is non-empty and all of its
    values are zero (the euclidean length is then 0).

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.7071067811865475), (2, 0.7071067811865475)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
    [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
    """
    # Euclidean (L2) length of the value vector.
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    return collections.defaultdict(
        int, ((k, v / length) for k, v in frequencies.items()))
def scale(frequencies):
    """Scale a set of frequencies so the largest is 1.

    Every value is divided by the largest value. The input mapping is not
    modified; the result is a new ``collections.defaultdict(int)``, so keys
    that are absent from ``frequencies`` read as 0.

    Raises ValueError if ``frequencies`` is empty (``max`` of an empty
    sequence) and ZeroDivisionError if the largest value is zero.

    >>> sorted(scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(scale({1: 1, 2: 1}).items())
    [(1, 1.0), (2, 1.0)]
    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
    [(1, 1.0), (2, 1.0), (3, 1.0)]
    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.5), (2, 1.0), (3, 0.5)]
    """
    largest = max(frequencies.values())
    return collections.defaultdict(
        int, ((k, v / largest) for k, v in frequencies.items()))
def l2(frequencies1, frequencies2):
    """Finds the distances between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is the euclidean (L2) distance: the square root of the sum,
    over the keys of frequencies1, of the squared differences between the
    two profiles. Keys that appear only in frequencies2 are ignored.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.7320...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.7320...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}),
    ...    normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.9194...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    total = 0
    for k in frequencies1:
        total += (frequencies1[k] - frequencies2[k]) ** 2
    return total ** 0.5


# Descriptive alias for the L2 norm of the difference.
euclidean_distance = l2
def l1(frequencies1, frequencies2):
    """Finds the distances between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is the L1 (taxicab) distance: the sum, over the keys of
    frequencies1, of the absolute differences between the two profiles.
    Keys that appear only in frequencies2 are ignored.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    total = 0
    for k in frequencies1:
        total += abs(frequencies1[k] - frequencies2[k])
    return total
def l3(frequencies1, frequencies2):
    """Finds the distances between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is the L3 distance: the cube root of the sum, over the keys
    of frequencies1, of the cubed absolute differences between the two
    profiles. Keys that appear only in frequencies2 are ignored.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.4422...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.4422...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}),
    ...    normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.7721...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.7234...
    """
    total = 0
    for k in frequencies1:
        # abs() keeps negative differences from cancelling positive ones
        # once they are raised to an odd power.
        total += abs(frequencies1[k] - frequencies2[k]) ** 3
    return total ** (1 / 3)
def geometric_mean(frequencies1, frequencies2):
    """Finds the geometric mean of the absolute differences between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The absolute differences for the keys of frequencies1 are multiplied
    together and the n-th root of that product is returned, where n is the
    number of keys in frequencies1. A single zero difference therefore
    makes the whole result zero.

    Raises ZeroDivisionError if frequencies1 is empty.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.4422...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}),
    ...                normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.3849...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}),
    ...                normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}),
    ...                normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2134...
    """
    product = 1
    for k in frequencies1:
        product *= abs(frequencies1[k] - frequencies2[k])
    # NOTE(review): the n-th root is applied so the value is a true geometric
    # mean, as the summary line states -- confirm no caller relies on
    # receiving the raw product instead.
    return product ** (1 / len(frequencies1))
def harmonic_mean(frequencies1, frequencies2):
    """Finds the harmonic mean of the absolute differences between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is the number of keys in frequencies1 divided by the sum of
    the reciprocals of the absolute differences. If any difference is
    exactly zero its reciprocal is undefined, so 0 is returned at once.

    Raises ZeroDivisionError if frequencies1 is empty.

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.2857...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}),
    ...               normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.3849...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}),
    ...               normalise({'a':1, 'b':1, 'c':1}))
    0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}),
    ...               normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.1749...
    """
    total = 0
    for k in frequencies1:
        difference = abs(frequencies1[k] - frequencies2[k])
        if difference == 0:
            # A zero term drives the harmonic mean to zero; returning here
            # also avoids dividing by zero below.
            return 0
        total += 1 / difference
    return len(frequencies1) / total
def cosine_distance(frequencies1, frequencies2):
    """Finds the distances between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is one minus the cosine of the angle between the profiles:
    the dot product (taken over the keys of frequencies1) divided by the
    product of the two euclidean lengths. Profiles that point in the same
    direction give a value at or very near 0 regardless of their scale;
    floating-point rounding can make it a tiny negative number.

    Raises ZeroDivisionError if either profile has zero euclidean length.

    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.4226...
    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.2928...
    >>> cosine_distance({'a':3, 'b':4}, {'a':6, 'b':8})
    0.0
    """
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1:
        numerator += frequencies1[k] * frequencies2[k]
        length1 += frequencies1[k] ** 2
    for k in frequencies2:
        # Sum of squares, not of raw values: the square root taken below
        # must yield the euclidean length of frequencies2.
        length2 += frequencies2[k] ** 2
    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
172 if __name__
== "__main__":