import collections
+from math import log10
def normalise(frequencies):
    """Scale a set of frequencies so they sum to one.

    Returns a ``collections.defaultdict(int)`` in which every key of
    *frequencies* maps to its value divided by the total of all values.
    Keys that were not in the input read as 0.

    An empty input gives an empty result.  A non-empty input whose values
    sum to zero raises ``ZeroDivisionError``.

    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    >>> normalise({1: 1, 2: 3})[99]
    0
    """
    total = sum(frequencies.values())
    return collections.defaultdict(
        int, ((k, v / total) for k, v in frequencies.items()))
-
def scale(frequencies):
    """Scale a set of frequencies so the largest is 1.

    Returns a ``collections.defaultdict(int)`` in which every key of
    *frequencies* maps to its value divided by the largest value.
    Keys that were not in the input read as 0.

    An empty input gives an empty result.  A non-empty input whose largest
    value is zero raises ``ZeroDivisionError``.

    >>> sorted(scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(scale({1: 1, 2: 1}).items())
    [(1, 1.0), (2, 1.0)]
    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
    [(1, 1.0), (2, 1.0), (3, 1.0)]
    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.5), (2, 1.0), (3, 0.5)]
    >>> scale({})
    defaultdict(<class 'int'>, {})
    """
    # ``default`` is only used for an empty input, where nothing is divided.
    largest = max(frequencies.values(), default=1)
    return collections.defaultdict(
        int, ((k, v / largest) for k, v in frequencies.items()))
-
def identity_scale(frequencies):
    """Return *frequencies* unchanged.

    A no-op counterpart to ``normalise``: no scaling is applied and the
    very same object is handed back, not a copy.

    >>> identity_scale({1: 1, 2: 2})
    {1: 1, 2: 2}
    """
    return frequencies
+
def l2(frequencies1, frequencies2):
    """Find the L2 (Euclidean) distance between two frequency profiles.

    Both profiles are expressed as dictionaries.  Only the keys of
    *frequencies1* are visited, and each one is looked up in
    *frequencies2*, so every key in frequencies1 must also be in
    frequencies2 (or frequencies2 must supply a default for missing keys,
    as a ``defaultdict`` does).  Keys found only in frequencies2 do not
    contribute to the result.

    >>> l2({'a': 1, 'b': 1, 'c': 1}, {'a': 1, 'b': 1, 'c': 1})
    0.0
    >>> l2({'a': 0, 'b': 1}, {'a': 1, 'b': 1})
    1.0
    >>> l2({'a': 0, 'b': 2, 'c': 0}, {'a': 1, 'b': 1, 'c': 1}) # doctest: +ELLIPSIS
    1.73205080...
    """
    total = sum((frequencies1[k] - frequencies2[k]) ** 2
                for k in frequencies1)
    return total ** 0.5


euclidean_distance = l2
1
"""
total = 0
- for k in frequencies1.keys():
+ for k in frequencies1:
total += abs(frequencies1[k] - frequencies2[k])
return total
0.6299605249...
"""
total = 0
- for k in frequencies1.keys():
+ for k in frequencies1:
total += abs(frequencies1[k] - frequencies2[k]) ** 3
return total ** (1/3)
0.009259259...
"""
total = 1
- for k in frequencies1.keys():
+ for k in frequencies1:
total *= abs(frequencies1[k] - frequencies2[k])
return total
0.2
"""
total = 0
- for k in frequencies1.keys():
+ for k in frequencies1:
if abs(frequencies1[k] - frequencies2[k]) == 0:
return 0
total += 1 / abs(frequencies1[k] - frequencies2[k])
return len(frequencies1) / total
def cosine_similarity(frequencies1, frequencies2):
    """Find the cosine similarity of two frequency profiles.

    Both profiles are expressed as dictionaries.  The result is the dot
    product of the two profiles (taken over the keys of *frequencies1*)
    divided by the product of their Euclidean lengths.

    Assumes every key in frequencies1 is also in frequencies2.
    Raises ``ZeroDivisionError`` if either profile has zero length
    (is empty or holds only zeros).

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.5773502691...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.7071067811...
    """
    numerator = sum(frequencies1[k] * frequencies2[k] for k in frequencies1)
    # Squared lengths; each profile is measured over its own full key set.
    length1 = sum(v ** 2 for v in frequencies1.values())
    length2 = sum(v ** 2 for v in frequencies2.values())
    return numerator / (length1 ** 0.5 * length2 ** 0.5)
def index_of_coincidence(frequencies):
    """Find the (expected) index of coincidence given a set of frequencies.

    Computed as the sum of the squared values of *frequencies* multiplied
    by the number of keys it holds.  An empty input gives 0.

    >>> index_of_coincidence({'a': 0.5, 'b': 0.5})
    1.0
    >>> index_of_coincidence({'a': 1, 'b': 0})
    2
    """
    return sum(f ** 2 for f in frequencies.values()) * len(frequencies)
-
if __name__ == "__main__":
import doctest