"""Various norms, for calculating the distances between two frequency
def lp(v1, v2=None, p=2):
    """Find the L_p norm of one vector, or of the difference of two.

    If passed one vector, find the length of that vector. If passed two
    vectors, find the length of the difference between them.

    Vectors are dictionaries mapping keys to numbers. A key that is
    present in only one of the two vectors is treated as zero in the
    other, so plain dicts with different key sets are accepted.

    Args:
        v1: the first (or only) vector.
        v2: the optional second vector; when None, the norm of v1 alone
            is returned.
        p: the order of the norm (1 = Manhattan, 2 = Euclidean, ...).

    Returns:
        The L_p length as a float.

    >>> lp({'a': 3, 'b': 4})
    5.0
    >>> lp({'a': 2, 'b': 2, 'c': 2}, {'a': 1, 'b': 1, 'c': 1}, p=1)
    3.0
    >>> lp({'a': 1}, {'b': 1}, p=1)
    2.0
    """
    if v2 is None:
        # A norm is defined on magnitudes, so odd values of p remain
        # well behaved even if an entry happens to be negative.
        magnitudes = [abs(value) for value in v1.values()]
    else:
        # The union of keys only makes sense if a missing entry counts
        # as zero; .get() gives that without mutating a defaultdict.
        magnitudes = [abs(v1.get(key, 0) - v2.get(key, 0))
                      for key in (v1.keys() | v2.keys())]
    return sum(magnitude ** p for magnitude in magnitudes) ** (1.0 / p)
19 """Finds the distances between two frequency profiles, expressed as
20 dictionaries. Assumes every key in frequencies1 is also in frequencies2
22 >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
24 >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
26 >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
28 >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
30 >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
36 """Finds the distances between two frequency profiles, expressed as dictionaries.
37 Assumes every key in frequencies1 is also in frequencies2
39 >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
41 >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
43 >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
45 >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
47 >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
48 normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
50 >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
56 """Finds the distances between two frequency profiles, expressed as
57 dictionaries. Assumes every key in frequencies1 is also in frequencies2
59 >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
61 >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
63 >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
65 >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
66 normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
68 >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
70 >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
def linf(v1, v2=None):
    """Find the L_infinity (maximum) norm of one vector, or of the
    difference between two.

    Vectors are dictionaries mapping keys to numbers. A key that is
    present in only one of the two vectors is treated as zero in the
    other.

    Args:
        v1: the first (or only) vector.
        v2: the optional second vector; when None, the norm of v1 alone
            is returned.

    Returns:
        The largest absolute entry (or absolute difference), or 0 when
        there are no entries at all.

    >>> linf({'a': 0, 'b': 2, 'c': 0}, {'a': 1, 'b': 1, 'c': 1})
    1
    >>> linf({'a': 1, 'b': -5})
    5
    >>> linf({})
    0
    """
    if v2 is None:
        magnitudes = [abs(value) for value in v1.values()]
    else:
        # Missing entries count as zero, matching the union-of-keys walk.
        magnitudes = [abs(v1.get(key, 0) - v2.get(key, 0))
                      for key in (v1.keys() | v2.keys())]
    # default=0 keeps an empty profile from raising ValueError.
    return max(magnitudes, default=0)
def scale(frequencies, norm=l2):
    """Divide every frequency by the length of the whole profile.

    The length is whatever `norm` returns when called with the profile
    as its only argument. The result is a defaultdict(int) holding the
    same keys as `frequencies`.
    """
    magnitude = norm(frequencies)
    scaled = collections.defaultdict(int)
    for key, value in frequencies.items():
        scaled[key] = value / magnitude
    return scaled
91 """Scale a set of frequencies so they have a unit euclidean length
93 >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
95 >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
96 [(1, 0.7071067...), (2, 0.7071067...)]
97 >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
98 [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
99 >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
100 [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
105 """Scale a set of frequencies so they sum to one
107 >>> sorted(normalise({1: 1, 2: 0}).items())
109 >>> sorted(normalise({1: 1, 2: 1}).items())
111 >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
112 [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
113 >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
114 [(1, 0.25), (2, 0.5), (3, 0.25)]
# Euclidean-named aliases for the l2 distance and the l2_scale helper.
euclidean_distance = l2
euclidean_scale = l2_scale
def geometric_mean(frequencies1, frequencies2):
    """Find the geometric mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.

    Assumes every key in frequencies1 is also in frequencies2; only the
    keys of frequencies1 are examined.

    Args:
        frequencies1: the first profile; its keys drive the comparison.
        frequencies2: the second profile.

    Returns:
        The n-th root of the product of the n absolute differences, as a
        float. Returns 0.0 when frequencies1 is empty.

    Raises:
        KeyError: if frequencies2 is a plain dict that lacks a key of
            frequencies1.

    >>> geometric_mean({'a': 2, 'b': 2, 'c': 2}, {'a': 1, 'b': 1, 'c': 1})
    1.0
    >>> geometric_mean({'a': 3, 'b': 10}, {'a': 1, 'b': 2})
    4.0
    >>> geometric_mean({'a': 1, 'b': 1}, {'a': 1, 'b': 5})
    0.0
    """
    count = len(frequencies1)
    if count == 0:
        # No differences to average; avoid dividing by zero below.
        return 0.0
    product = 1.0
    for key in frequencies1:
        product *= abs(frequencies1[key] - frequencies2[key])
    # A mean of n values is the n-th root of their product, not the
    # bare product.
    return product ** (1.0 / count)
def harmonic_mean(frequencies1, frequencies2):
    """Find the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.

    Assumes every key in frequencies1 is also in frequencies2; only the
    keys of frequencies1 are examined.

    Args:
        frequencies1: the first profile; its keys drive the comparison.
        frequencies2: the second profile.

    Returns:
        The number of keys divided by the sum of the reciprocals of the
        absolute differences. Returns 0.0 if any difference is zero (the
        limiting value of the harmonic mean) or if frequencies1 is empty.

    Raises:
        KeyError: if frequencies2 is a plain dict that lacks a key of
            frequencies1.

    >>> harmonic_mean({'a': 2, 'b': 2, 'c': 2}, {'a': 1, 'b': 1, 'c': 1})
    1.0
    >>> harmonic_mean({'a': 2, 'b': 2, 'c': 2}, {'a': 1, 'b': 5, 'c': 1}) # doctest: +ELLIPSIS
    1.285714...
    >>> harmonic_mean({'a': 1, 'b': 2}, {'a': 1, 'b': 5})
    0.0
    """
    differences = [abs(frequencies1[key] - frequencies2[key])
                   for key in frequencies1]
    # A zero difference has no reciprocal, and an empty profile would
    # divide by a zero total; both cases yield 0.0.
    if not differences or any(difference == 0 for difference in differences):
        return 0.0
    return len(differences) / sum(1.0 / difference for difference in differences)
def cosine_similarity(frequencies1, frequencies2):
    """Find the cosine of the angle between two frequency profiles,
    expressed as dictionaries.

    A key of frequencies1 that is absent from frequencies2 contributes
    nothing to the dot product (it is treated as zero).

    Args:
        frequencies1: the first profile.
        frequencies2: the second profile.

    Returns:
        The dot product of the two profiles divided by the product of
        their Euclidean lengths.

    Raises:
        ZeroDivisionError: if either profile has zero length, because
            the angle is then undefined.

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.00000...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.00000...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.57735...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.70710...
    """
    dot_product = sum(value * frequencies2.get(key, 0)
                      for key, value in frequencies1.items())
    squared_length1 = sum(value ** 2 for value in frequencies1.values())
    squared_length2 = sum(value ** 2 for value in frequencies2.values())
    return dot_product / (squared_length1 ** 0.5 * squared_length2 ** 0.5)
203 if __name__
== "__main__":