def normalise(frequencies):
    """Scale a set of frequencies so they sum to one.

    Returns a collections.defaultdict(int), so looking up a symbol absent
    from the input yields 0 rather than raising KeyError.

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    # sum() consumes the values view directly; no intermediate list needed.
    length = sum(frequencies.values())
    return collections.defaultdict(int,
                                   ((k, v / length)
                                    for k, v in frequencies.items()))
def euclidean_scale(frequencies):
    """Scale a set of frequencies so they have a unit euclidean length.

    Returns a collections.defaultdict(int), so looking up a symbol absent
    from the input yields 0 rather than raising KeyError.

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    # Euclidean (L2) norm: sqrt of the sum of squares. A generator
    # expression avoids building a throwaway list inside sum().
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    return collections.defaultdict(int,
                                   ((k, v / length)
                                    for k, v in frequencies.items()))
def scale(frequencies):
    """Scale a set of frequencies so the largest is 1.

    Returns a collections.defaultdict(int), so looking up a symbol absent
    from the input yields 0 rather than raising KeyError.

    >>> sorted(scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(scale({1: 1, 2: 1}).items())
    [(1, 1.0), (2, 1.0)]
    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
    [(1, 1.0), (2, 1.0), (3, 1.0)]
    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.5), (2, 1.0), (3, 0.5)]
    """
    largest = max(frequencies.values())
    rescaled = {symbol: count / largest
                for symbol, count in frequencies.items()}
    return collections.defaultdict(int, rescaled)
def l2(frequencies1, frequencies2):
    """Find the L2 (Euclidean) distance between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.81649658...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    # Accumulator init and final square root were missing from the
    # garbled source; reconstructed to give the standard L2 norm.
    total = 0
    for k in frequencies1:
        total += (frequencies1[k] - frequencies2[k]) ** 2
    return total ** 0.5

euclidean_distance = l2
def l1(frequencies1, frequencies2):
    """Find the L1 (Manhattan) distance between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    # Accumulator init and return were missing from the garbled source;
    # reconstructed: the L1 distance is the sum of absolute differences.
    total = 0
    for k in frequencies1:
        total += abs(frequencies1[k] - frequencies2[k])
    return total
def l3(frequencies1, frequencies2):
    """Find the L3 distance between two frequency profiles, expressed as
    dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.71793...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.62996...
    """
    # Accumulator init was missing from the garbled source; reconstructed.
    total = 0
    for k in frequencies1:
        total += abs(frequencies1[k] - frequencies2[k]) ** 3
    # Cube root of the sum of cubed absolute differences.
    return total ** (1/3)
def geometric_mean(frequencies1, frequencies2):
    """Find the product of the absolute differences between two frequency
    profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    NOTE(review): despite the name, no n-th root is taken, so this is the
    raw product of the differences, not a true geometric mean — confirm
    whether callers rely on this.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.01382...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.00925...
    """
    # Accumulator init and return were missing from the garbled source;
    # reconstructed. Starts at 1, the multiplicative identity.
    total = 1
    for k in frequencies1:
        total *= abs(frequencies1[k] - frequencies2[k])
    return total
def harmonic_mean(frequencies1, frequencies2):
    """Find the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.28571428...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.228571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':1}))
    0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':0}))
    0.2
    """
    total = 0
    for k in frequencies1:
        if abs(frequencies1[k] - frequencies2[k]) == 0:
            # NOTE(review): this guard's body was missing from the garbled
            # source; returning 0 short-circuits the undefined harmonic
            # mean (a zero term would divide by zero below) — confirm
            # against the project history.
            return 0
        total += 1 / abs(frequencies1[k] - frequencies2[k])
    return len(frequencies1) / total
def cosine_distance(frequencies1, frequencies2):
    """Find the cosine distance between two frequency profiles, expressed
    as dictionaries: 1 minus the cosine of the angle between the profile
    vectors (0.0 for parallel profiles, up to 1.0 for orthogonal ones,
    modulo floating-point error).
    Assumes every key in frequencies1 is also in frequencies2.

    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    -2.2204...e-16
    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    -2.2204...e-16
    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.42264973...
    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.29289321...
    """
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1:
        numerator += frequencies1[k] * frequencies2[k]
        length1 += frequencies1[k] ** 2
    for k in frequencies2:
        # BUG FIX: the original accumulated the raw value, not its square,
        # so the second vector's Euclidean length (and hence the cosine)
        # was wrong whenever the values weren't all 0 or 1.
        length2 += frequencies2[k] ** 2
    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
def index_of_coincidence(frequencies):
    """Find the (expected) index of coincidence given a set of
    frequencies: the sum of the squared frequencies, scaled by the
    number of distinct symbols.
    """
    sum_of_squares = sum(f * f for f in frequencies.values())
    return sum_of_squares * len(frequencies)
205 if __name__
== "__main__":