def lp(v1, v2=None, p=2):
    """Find the L_p norm. If passed one vector, find the length of that vector.
    If passed two vectors, find the length of the difference between them.

    Vectors are dictionaries mapping keys to numbers. A key that appears in
    only one of the two vectors is treated as having the value 0 in the other.

    v1 -- the first vector
    v2 -- the optional second vector; if None, the length of v1 is found
    p  -- the order of the norm (1: sum of magnitudes, 2: Euclidean length)

    Returns the p-th root of the sum of the p-th powers of the component
    magnitudes, as a float.

    >>> lp({'a': 3, 'b': 4})
    5.0
    >>> lp({'a': 2, 'b': 2, 'c': 2}, {'a': 1, 'b': 1, 'c': 1}, p=1)
    3.0
    >>> lp({'a': 0, 'b': 1}, {'a': 1, 'b': 1})
    1.0
    """
    if v2 is None:
        # One-vector form: measure v1 itself.
        magnitudes = [abs(v) for v in v1.values()]
    else:
        # .get() keeps plain dicts with differing key sets from raising
        # KeyError, and avoids inserting keys into defaultdict arguments.
        magnitudes = [abs(v1.get(k, 0) - v2.get(k, 0))
                      for k in (v1.keys() | v2.keys())]
    return sum(m ** p for m in magnitudes) ** (1.0 / p)
16 """Finds the distances between two frequency profiles, expressed as
17 dictionaries. Assumes every key in frequencies1 is also in frequencies2
19 >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
21 >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
23 >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
25 >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
27 >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
33 """Finds the distances between two frequency profiles, expressed as dictionaries.
34 Assumes every key in frequencies1 is also in frequencies2
36 >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
38 >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
40 >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
42 >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
44 >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
45 normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
47 >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
53 """Finds the distances between two frequency profiles, expressed as
54 dictionaries. Assumes every key in frequencies1 is also in frequencies2
56 >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
58 >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
60 >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
62 >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
63 normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
65 >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
67 >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
def linf(v1, v2=None):
    """Find the L_infinity norm: the largest component magnitude.
    If passed one vector, find the largest magnitude in that vector.
    If passed two vectors, find the largest absolute difference between them.

    Vectors are dictionaries mapping keys to numbers. A key that appears in
    only one of the two vectors is treated as having the value 0 in the other.
    An empty vector has norm 0.

    >>> linf({'a': 1, 'b': 3})
    3
    >>> linf({'a': 2, 'b': 2, 'c': 2}, {'a': 1, 'b': 5, 'c': 1})
    3
    """
    if v2 is None:
        # One-vector form: measure v1 itself.
        magnitudes = [abs(v) for v in v1.values()]
    else:
        # .get() keeps plain dicts with differing key sets from raising
        # KeyError, and avoids inserting keys into defaultdict arguments.
        magnitudes = [abs(v1.get(k, 0) - v2.get(k, 0))
                      for k in (v1.keys() | v2.keys())]
    # default=0 stops max() raising ValueError on an empty vector.
    return max(magnitudes, default=0)
def scale(frequencies, norm=l2):
    """Divide every value in frequencies by the length of the whole set.

    frequencies -- dictionary mapping keys to numbers
    norm        -- function called on frequencies to obtain the length

    Returns a collections.defaultdict(int) holding the scaled values, so
    keys that were not in frequencies read as 0.
    """
    magnitude = norm(frequencies)
    scaled = collections.defaultdict(int)
    for key, value in frequencies.items():
        scaled[key] = value / magnitude
    return scaled
86 """Scale a set of frequencies so they have a unit euclidean length
88 >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
90 >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
91 [(1, 0.7071067...), (2, 0.7071067...)]
92 >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
93 [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
94 >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
95 [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
100 """Scale a set of frequencies so they sum to one
102 >>> sorted(normalise({1: 1, 2: 0}).items())
104 >>> sorted(normalise({1: 1, 2: 1}).items())
106 >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
107 [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
108 >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
109 [(1, 0.25), (2, 0.5), (3, 0.25)]
# Descriptive aliases: the same function objects as l2 and l2_scale,
# available under their "euclidean" names.
euclidean_distance = l2
euclidean_scale = l2_scale
def geometric_mean(frequencies1, frequencies2):
    """Finds the geometric mean of the absolute differences between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is the n-th root of the product of the n absolute differences,
    so it is 0.0 whenever any single difference is zero. Empty profiles give
    0.0.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.44224...
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':2, 'b':5, 'c':1})
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.23998...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.20998...
    """
    differences = [abs(frequencies1[k] - frequencies2[k])
                   for k in frequencies1]
    if not differences:
        # No components to compare; also avoids dividing by zero below.
        return 0.0
    product = 1.0
    for difference in differences:
        product *= difference
    # The n-th root turns the product into a mean.
    return product ** (1.0 / len(differences))
def harmonic_mean(frequencies1, frequencies2):
    """Finds the harmonic mean of the absolute differences between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is the number of keys divided by the sum of the reciprocals of
    the absolute differences. If any difference is zero the result is 0.0,
    and empty profiles give 0.0.

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.28571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.22857...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2...
    """
    if not frequencies1:
        # No components to compare; avoids 0 / 0 at the final division.
        return 0.0
    reciprocal_total = 0.0
    for k in frequencies1:
        difference = abs(frequencies1[k] - frequencies2[k])
        if difference == 0:
            # The reciprocal is undefined; a zero term makes the mean zero.
            return 0.0
        reciprocal_total += 1.0 / difference
    return len(frequencies1) / reciprocal_total
def cosine_similarity(frequencies1, frequencies2):
    """Finds the cosine of the angle between two frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    The result is the dot product of the two profiles divided by the product
    of their Euclidean lengths. If either profile has zero length the angle
    is undefined and 0.0 is returned.

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.57735...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.70710...
    """
    numerator = 0
    squared_length1 = 0
    for k in frequencies1:
        numerator += frequencies1[k] * frequencies2[k]
        squared_length1 += frequencies1[k] ** 2
    # frequencies2 may hold keys that frequencies1 lacks, so its length is
    # accumulated over its own values.
    squared_length2 = sum(v ** 2 for v in frequencies2.values())
    denominator = squared_length1 ** 0.5 * squared_length2 ** 0.5
    if denominator == 0:
        # At least one profile is all zeros (or empty).
        return 0.0
    return numerator / denominator
198 if __name__
== "__main__":