"""Define a variety of norms for finding distances between vectors."""
def normalise(frequencies):
    """Scale a set of frequencies so they sum to one.

    ``frequencies`` is a mapping of item -> count. The result is a
    ``collections.defaultdict(int)`` holding every key of ``frequencies``
    with its value divided by the sum of all values; looking up a key that
    is not present yields 0.

    An empty mapping gives an empty result. A non-empty mapping whose values
    sum to zero raises ZeroDivisionError.

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    # The L1 "length" of the profile: the plain sum of its values.
    length = sum(frequencies.values())
    return collections.defaultdict(
        int, ((k, v / length) for k, v in frequencies.items()))
def euclidean_scale(frequencies):
    """Scale a set of frequencies so they have a unit euclidean length.

    ``frequencies`` is a mapping of item -> count. The result is a
    ``collections.defaultdict(int)`` holding every key of ``frequencies``
    with its value divided by the square root of the sum of squared values;
    looking up a key that is not present yields 0.

    An empty mapping gives an empty result. A non-empty mapping whose values
    are all zero raises ZeroDivisionError.

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    # The L2 length of the profile: square root of the sum of squares.
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    return collections.defaultdict(
        int, ((k, v / length) for k, v in frequencies.items()))
def identity_scale(frequencies):
    """Don't scale a set of frequencies. (For use when a function expects a
    scaling function but you don't want to supply one.)

    Returns the ``frequencies`` object it was given, unchanged and uncopied.
    """
    return frequencies
def l2(frequencies1, frequencies2):
    """Finds the distances between two frequency profiles, expressed as
    dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    Returns the euclidean (L2) distance: the square root of the sum of the
    squared differences, taken over the keys of ``frequencies1``. Keys that
    appear only in ``frequencies2`` are ignored. If ``frequencies2`` is a
    plain dict, a key missing from it raises KeyError.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.81649658...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    total = sum((frequencies1[k] - frequencies2[k]) ** 2
                for k in frequencies1)
    return total ** 0.5


# Public alias: the L2 norm is the ordinary euclidean distance.
euclidean_distance = l2
def l1(frequencies1, frequencies2):
    """Finds the distances between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in frequencies2

    Returns the L1 (taxicab) distance: the sum of the absolute differences,
    taken over the keys of ``frequencies1``. Keys that appear only in
    ``frequencies2`` are ignored. If ``frequencies2`` is a plain dict, a key
    missing from it raises KeyError.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    return sum(abs(frequencies1[k] - frequencies2[k]) for k in frequencies1)
def l3(frequencies1, frequencies2):
    """Finds the distances between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in frequencies2

    Returns the L3 distance: the cube root of the sum of the cubed absolute
    differences, taken over the keys of ``frequencies1``. Keys that appear
    only in ``frequencies2`` are ignored. If ``frequencies2`` is a plain
    dict, a key missing from it raises KeyError.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.718144...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.6299605...
    """
    total = sum(abs(frequencies1[k] - frequencies2[k]) ** 3
                for k in frequencies1)
    return total ** (1/3)
def geometric_mean(frequencies1, frequencies2):
    """Finds the geometric mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    NOTE: the value returned is the *product* of the absolute differences
    over the keys of ``frequencies1``; no n-th root is taken. For profiles
    with the same number of keys this orders results exactly as the true
    geometric mean would. Any single zero difference makes the result zero.
    If ``frequencies2`` is a plain dict, a key missing from it raises
    KeyError.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.0138214...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.00925925...
    """
    # Start from the multiplicative identity so an empty profile gives 1.
    total = 1
    for k in frequencies1:
        total *= abs(frequencies1[k] - frequencies2[k])
    return total
def harmonic_mean(frequencies1, frequencies2):
    """Finds the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The mean is taken over the keys of ``frequencies1``. If any difference
    is zero the result is 0 (an int), returned as soon as that key is
    reached. If ``frequencies2`` is a plain dict, a key missing from it
    raises KeyError. An empty ``frequencies1`` raises ZeroDivisionError.

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.285714285...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.228571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2...
    """
    total = 0
    for k in frequencies1:
        difference = abs(frequencies1[k] - frequencies2[k])
        if difference == 0:
            # A zero term pulls the harmonic mean to zero, and taking its
            # reciprocal below would divide by zero.
            return 0
        total += 1 / difference
    return len(frequencies1) / total
def cosine_similarity(frequencies1, frequencies2):
    """Finds the cosine similarity between two frequency profiles, expressed
    as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    This is a similarity, not a distance: the dot product of the two
    profiles (over the keys of ``frequencies1``) divided by the product of
    their euclidean lengths. The length of ``frequencies2`` is taken over
    all of its own keys. If ``frequencies2`` is a plain dict, a key missing
    from it raises KeyError. If either profile has zero length,
    ZeroDivisionError is raised.

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.57735026...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.70710678...
    """
    # The dot product is computed first: if frequencies2 is a defaultdict,
    # these lookups may insert zero-valued keys before its length is taken.
    numerator = sum(frequencies1[k] * frequencies2[k] for k in frequencies1)
    length1 = sum(v ** 2 for v in frequencies1.values())
    length2 = sum(v ** 2 for v in frequencies2.values())
    return numerator / (length1 ** 0.5 * length2 ** 0.5)
193 if __name__
== "__main__":