def normalise(frequencies):
    """Scale a set of frequencies so they have a unit Euclidean length.

    Every value is divided by the square root of the sum of the squared
    values.  A profile with no non-zero values has no direction to keep,
    so it is returned with every value set to 0.0 rather than raising
    ZeroDivisionError.

    Args:
        frequencies: mapping of item -> numeric frequency.

    Returns:
        A collections.defaultdict(int) holding the same keys with scaled
        values; keys that are absent read as 0.

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.7071067811865475), (2, 0.7071067811865475)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
    [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
    >>> sorted(normalise({1: 0, 2: 0}).items())
    [(1, 0.0), (2, 0.0)]
    """
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    if length == 0:
        # Empty or all-zero profile: dividing would raise, so return zeros.
        return collections.defaultdict(int, ((k, 0.0) for k in frequencies))
    return collections.defaultdict(
        int, ((k, v / length) for k, v in frequencies.items()))
def scale(frequencies):
    """Scale a set of frequencies so the largest is 1.

    Every value is divided by the largest value in the profile.  A profile
    that is empty or whose values are all zero cannot be scaled, so it is
    returned with every value set to 0.0 rather than raising.

    Args:
        frequencies: mapping of item -> numeric frequency.

    Returns:
        A collections.defaultdict(int) holding the same keys with scaled
        values; keys that are absent read as 0.

    >>> sorted(scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(scale({1: 1, 2: 1}).items())
    [(1, 1.0), (2, 1.0)]
    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
    [(1, 1.0), (2, 1.0), (3, 1.0)]
    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.5), (2, 1.0), (3, 0.5)]
    >>> sorted(scale({1: 0, 2: 0}).items())
    [(1, 0.0), (2, 0.0)]
    """
    if not any(frequencies.values()):
        # Covers both the empty profile (max() would raise ValueError) and
        # the all-zero profile (the division would raise ZeroDivisionError).
        return collections.defaultdict(int, ((k, 0.0) for k in frequencies))
    largest = max(frequencies.values())
    return collections.defaultdict(
        int, ((k, v / largest) for k, v in frequencies.items()))
def l2(frequencies1, frequencies2):
    """Find the Euclidean (L2) distance between two frequency profiles.

    The profiles are expressed as dictionaries.  Assumes every key in
    frequencies1 is also in frequencies2; only the keys of frequencies1
    are compared.

    Args:
        frequencies1: mapping of item -> numeric frequency.
        frequencies2: mapping of item -> numeric frequency.

    Returns:
        The square root of the sum of the squared differences, as a float.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})  # doctest: +ELLIPSIS
    1.7320508...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})  # doctest: +ELLIPSIS
    1.7320508...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))  # doctest: +ELLIPSIS
    0.919401...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    total = sum((frequencies1[k] - frequencies2[k]) ** 2
                for k in frequencies1)
    return total ** 0.5


# More descriptive alias for the same function.
euclidean_distance = l2
def l1(frequencies1, frequencies2):
    """Find the L1 (sum of absolute differences) distance between two profiles.

    The profiles are expressed as dictionaries.  Assumes every key in
    frequencies1 is also in frequencies2; only the keys of frequencies1
    are compared.

    Args:
        frequencies1: mapping of item -> numeric frequency.
        frequencies2: mapping of item -> numeric frequency.

    Returns:
        The sum of the absolute differences (0 for an empty profile).

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    return sum(abs(frequencies1[k] - frequencies2[k]) for k in frequencies1)
def l3(frequencies1, frequencies2):
    """Find the L3 distance between two frequency profiles.

    The profiles are expressed as dictionaries.  Assumes every key in
    frequencies1 is also in frequencies2; only the keys of frequencies1
    are compared.

    Args:
        frequencies1: mapping of item -> numeric frequency.
        frequencies2: mapping of item -> numeric frequency.

    Returns:
        The cube root of the sum of the cubed absolute differences, as a
        float.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})  # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})  # doctest: +ELLIPSIS
    1.44224957...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))  # doctest: +ELLIPSIS
    0.7721...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1}))  # doctest: +ELLIPSIS
    0.7234...
    """
    total = sum(abs(frequencies1[k] - frequencies2[k]) ** 3
                for k in frequencies1)
    return total ** (1 / 3)
def geometric_mean(frequencies1, frequencies2):
    """Find the geometric mean of the absolute differences between two profiles.

    The profiles are expressed as dictionaries.  Assumes every key in
    frequencies1 is also in frequencies2; only the keys of frequencies1
    are compared.

    The result is the n-th root of the product of the n absolute
    differences, so a single identical entry makes the result 0.0.

    Args:
        frequencies1: mapping of item -> numeric frequency.
        frequencies2: mapping of item -> numeric frequency.

    Returns:
        The geometric mean as a float; 0.0 when frequencies1 is empty
        (there are no differences to average).

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})  # doctest: +ELLIPSIS
    1.44224957...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))  # doctest: +ELLIPSIS
    0.38490017...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))  # doctest: +ELLIPSIS
    0.2134...
    """
    if not frequencies1:
        # Avoid 1 / 0 in the exponent below.
        return 0.0
    product = 1
    for k in frequencies1:
        product *= abs(frequencies1[k] - frequencies2[k])
    return product ** (1 / len(frequencies1))
def harmonic_mean(frequencies1, frequencies2):
    """Find the harmonic mean of the absolute differences between two profiles.

    The profiles are expressed as dictionaries.  Assumes every key in
    frequencies1 is also in frequencies2; only the keys of frequencies1
    are compared.

    Args:
        frequencies1: mapping of item -> numeric frequency.
        frequencies2: mapping of item -> numeric frequency.

    Returns:
        The number of keys divided by the sum of the reciprocals of the
        absolute differences.  Returns 0 as soon as any difference is zero
        (its reciprocal is undefined), and 0 when frequencies1 is empty.

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})  # doctest: +ELLIPSIS
    1.285714...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))  # doctest: +ELLIPSIS
    0.38490017...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))  # doctest: +ELLIPSIS
    0.1749...
    """
    if not frequencies1:
        # No differences to average; also avoids 0 / 0 below.
        return 0
    reciprocal_sum = 0
    for k in frequencies1:
        difference = abs(frequencies1[k] - frequencies2[k])
        if difference == 0:
            # A zero term drives the harmonic mean to zero.
            return 0
        reciprocal_sum += 1 / difference
    return len(frequencies1) / reciprocal_sum
def cosine_distance(frequencies1, frequencies2):
    """Find the cosine distance between two frequency profiles.

    The profiles are expressed as dictionaries.  Assumes every key in
    frequencies1 is also in frequencies2.

    The dot product is taken over the keys of frequencies1.  Each profile's
    length is the square root of the sum of its own squared values (the
    length of frequencies2 is taken over all of its keys).

    Args:
        frequencies1: mapping of item -> numeric frequency.
        frequencies2: mapping of item -> numeric frequency.

    Returns:
        1 minus the cosine of the angle between the profiles.  Floating
        point rounding can leave a tiny negative value for parallel
        profiles, as the first two examples show.

    Raises:
        ZeroDivisionError: if either profile has zero length.

    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    -2.220446049250313e-16
    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})  # doctest: +ELLIPSIS
    0.4226497...
    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})  # doctest: +ELLIPSIS
    0.2928932...
    """
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1:
        numerator += frequencies1[k] * frequencies2[k]
        length1 += frequencies1[k] ** 2
    for k in frequencies2:
        # Squared, to match length1: the Euclidean length needs the sum of
        # squares, not the sum of the raw values.
        length2 += frequencies2[k] ** 2
    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
174 if __name__
== "__main__":