# norms.py — frequency-profile scaling functions and distance metrics
# (from the cipher-training repository)
1 import collections
2 from math import log10
3
def normalise(frequencies):
    """Scale a set of frequencies so they sum to one.

    frequencies is a mapping of item -> count (or weight). Returns a
    ``collections.defaultdict(int)`` mapping each item to its proportion of
    the total, so lookups of missing keys read as 0. Raises
    ZeroDivisionError if the counts sum to zero.

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    # sum() consumes the values view directly; no generator wrapper needed.
    total = sum(frequencies.values())
    return collections.defaultdict(int, ((k, v / total)
                                         for (k, v) in frequencies.items()))
19
def euclidean_scale(frequencies):
    """Scale a set of frequencies so they have a unit euclidean length.

    frequencies is a mapping of item -> count (or weight). Returns a
    ``collections.defaultdict(int)`` mapping each item to its value divided
    by the euclidean norm of all values; missing keys read as 0. Raises
    ZeroDivisionError if every frequency is zero.

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    # Generator expression instead of a throwaway list inside sum().
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    return collections.defaultdict(int, ((k, v / length)
                                         for (k, v) in frequencies.items()))
35
def identity_scale(frequencies):
    """Leave a set of frequencies unchanged.

    A no-op drop-in for the other scaling functions (normalise,
    euclidean_scale). Returns the input mapping itself, not a copy.
    """
    return frequencies
38
39
def l2(frequencies1, frequencies2):
    """Euclidean (L2) distance between two frequency profiles, each given
    as a dictionary. Every key of frequencies1 must also appear in
    frequencies2.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.732050807...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.81649658...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    squared_differences = ((frequencies1[k] - frequencies2[k]) ** 2
                           for k in frequencies1)
    return sum(squared_differences) ** 0.5
euclidean_distance = l2
63
def l1(frequencies1, frequencies2):
    """Manhattan (L1) distance between two frequency profiles, each given
    as a dictionary. Every key of frequencies1 must also appear in
    frequencies2.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    return sum(abs(frequencies1[k] - frequencies2[k]) for k in frequencies1)
83
def l3(frequencies1, frequencies2):
    """L3 (cubic Minkowski) distance between two frequency profiles, each
    given as a dictionary. Every key of frequencies1 must also appear in
    frequencies2.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.4422495703...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.718144896...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.6299605249...
    """
    cubed_differences = (abs(frequencies1[k] - frequencies2[k]) ** 3
                         for k in frequencies1)
    return sum(cubed_differences) ** (1/3)
106
def geometric_mean(frequencies1, frequencies2):
    """Product of the absolute differences between two frequency profiles,
    each given as a dictionary. Every key of frequencies1 must also appear
    in frequencies2.

    NOTE(review): despite the name, no k-th root is taken — this returns the
    raw product, as the doctests below pin (e.g. differences 1, 3, 1 give 3,
    not 3 ** (1/3)). Renaming or "fixing" it would change pinned behaviour.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.01382140...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.009259259...
    """
    product = 1
    for difference in (abs(frequencies1[k] - frequencies2[k])
                       for k in frequencies1):
        product *= difference
    return product
132
def harmonic_mean(frequencies1, frequencies2):
    """Harmonic mean of the absolute differences between two frequency
    profiles, each given as a dictionary. Every key of frequencies1 must
    also appear in frequencies2. Returns the int 0 if any pair of values
    is equal (a zero difference has no reciprocal).

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.285714285...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.228571428571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2
    """
    differences = [abs(frequencies1[k] - frequencies2[k])
                   for k in frequencies1]
    if any(d == 0 for d in differences):
        return 0
    return len(differences) / sum(1 / d for d in differences)
160
161
def cosine_similarity(frequencies1, frequencies2):
    """Cosine similarity between two frequency profiles, each given as a
    dictionary. Every key of frequencies1 must also appear in frequencies2.

    The dot product is taken over the keys of frequencies1; each profile's
    norm is taken over its own keys. Raises ZeroDivisionError if either
    profile is all zeros.

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.5773502691...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.7071067811...
    """
    numerator = sum(frequencies1[k] * frequencies2[k] for k in frequencies1)
    # Iterate the dicts directly — no manual accumulators, no .keys().
    length1 = sum(v ** 2 for v in frequencies1.values())
    length2 = sum(v ** 2 for v in frequencies2.values())
    return numerator / (length1 ** 0.5 * length2 ** 0.5)
184
185
186
if __name__ == "__main__":
    # Run all the doctests embedded in this module's docstrings.
    import doctest
    doctest.testmod()