def normalise(frequencies):
    """Scale a set of frequencies so they sum to one.

    Returns a ``collections.defaultdict(int)`` so that lookups of unseen
    keys yield 0 instead of raising ``KeyError``.

    :param frequencies: mapping of item -> count (counts must sum to a
        non-zero total; an all-zero or empty mapping raises
        ``ZeroDivisionError``).
    :return: defaultdict of item -> proportion of the total.

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    # sum() accepts any iterable: no need to build an intermediate list.
    length = sum(frequencies.values())
    return collections.defaultdict(int,
        ((k, v / length) for k, v in frequencies.items()))
def euclidean_scale(frequencies):
    """Scale a set of frequencies so they have a unit euclidean length.

    Returns a ``collections.defaultdict(int)`` so that lookups of unseen
    keys yield 0 instead of raising ``KeyError``.

    :param frequencies: mapping of item -> count (at least one count must
        be non-zero, otherwise ``ZeroDivisionError`` is raised).
    :return: defaultdict of item -> count divided by the euclidean norm
        of all counts.

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    # Generator expression feeds sum() directly; no intermediate list.
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    return collections.defaultdict(int,
        ((k, v / length) for k, v in frequencies.items()))
def identity_scale(frequencies):
    """Identity scaling: return the frequencies unchanged.

    Exists so callers can select a "scaling" strategy uniformly
    alongside normalise and euclidean_scale.

    NOTE(review): the body of this function was missing from the
    extracted source; reconstructed as the identity its name implies —
    confirm against the original file.

    :param frequencies: mapping of item -> count.
    :return: the same mapping, unmodified.
    """
    return frequencies
def l2(frequencies1, frequencies2):
    """Finds the distance between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in frequencies2.

    :param frequencies1: mapping of item -> count.
    :param frequencies2: mapping of item -> count; must contain every key
        of frequencies1.
    :return: euclidean (L2) distance between the two profiles.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    # Accumulator initialisation and final square root were missing from
    # the visible source; restored here.
    total = 0
    for k in frequencies1:
        total += (frequencies1[k] - frequencies2[k]) ** 2
    return total ** 0.5

euclidean_distance = l2
def l1(frequencies1, frequencies2):
    """Finds the distance between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in frequencies2.

    :param frequencies1: mapping of item -> count.
    :param frequencies2: mapping of item -> count; must contain every key
        of frequencies1.
    :return: manhattan (L1) distance between the two profiles.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    # The explicit accumulator loop (whose init/return were missing from
    # the visible source) collapses into a single sum().
    return sum(abs(frequencies1[k] - frequencies2[k]) for k in frequencies1)
def l3(frequencies1, frequencies2):
    """Finds the distance between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in frequencies2.

    :param frequencies1: mapping of item -> count.
    :param frequencies2: mapping of item -> count; must contain every key
        of frequencies1.
    :return: L3 (cubic Minkowski) distance between the two profiles.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    # sum() over a generator replaces the manual accumulator loop (whose
    # `total = 0` initialisation was missing from the visible source).
    total = sum(abs(frequencies1[k] - frequencies2[k]) ** 3
                for k in frequencies1)
    return total ** (1/3)
def geometric_mean(frequencies1, frequencies2):
    """Finds the geometric mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    Note: despite the name, this returns the *product* of the absolute
    differences (no root is taken); a single zero difference makes the
    whole result zero.

    :param frequencies1: mapping of item -> count.
    :param frequencies2: mapping of item -> count; must contain every key
        of frequencies1.
    :return: product of per-key absolute differences.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3
    """
    # Accumulator initialisation and return were missing from the visible
    # source; restored (product must start at 1, not 0).
    total = 1
    for k in frequencies1:
        total *= abs(frequencies1[k] - frequencies2[k])
    return total
def harmonic_mean(frequencies1, frequencies2):
    """Finds the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    :param frequencies1: mapping of item -> count.
    :param frequencies2: mapping of item -> count; must contain every key
        of frequencies1.
    :return: harmonic mean of per-key absolute differences, or 0 if any
        difference is zero (the harmonic mean is undefined there, and 0
        is its limiting value).

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.285714...
    """
    # `total = 0` and the zero-difference branch body were missing from
    # the visible source; a zero difference must short-circuit to 0,
    # otherwise the reciprocal below divides by zero.
    total = 0
    for k in frequencies1:
        difference = abs(frequencies1[k] - frequencies2[k])  # hoisted: was computed twice
        if difference == 0:
            return 0
        total += 1 / difference
    return len(frequencies1) / total
def cosine_similarity(frequencies1, frequencies2):
    """Finds the cosine similarity between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    :param frequencies1: mapping of item -> count.
    :param frequencies2: mapping of item -> count; must contain every key
        of frequencies1.
    :return: cosine of the angle between the two profiles treated as
        vectors (1.0 means identical direction).

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.7071067...
    """
    # Initialisations of all three accumulators were missing from the
    # visible source; restored.
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1:
        numerator += frequencies1[k] * frequencies2[k]
        length1 += frequencies1[k] ** 2
    # Iterate the dict directly; .keys() is redundant.
    for k in frequencies2:
        length2 += frequencies2[k] ** 2
    return numerator / (length1 ** 0.5 * length2 ** 0.5)
187 if __name__
== "__main__":