More tweaking to conform with linting
[cipher-training.git] / norms.py
1 """Define a variety of norms for finding distances between vectors"""
2
3 import collections
4
def normalise(frequencies):
    """Scale a set of frequencies so they sum to one.

    Returns a collections.defaultdict(int), so looking up a missing key
    yields 0 rather than raising KeyError.

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    # sum() takes the values view directly; no intermediate list needed.
    length = sum(frequencies.values())
    return collections.defaultdict(int, ((k, v / length)
                                         for (k, v) in frequencies.items()))
20
def euclidean_scale(frequencies):
    """Scale a set of frequencies so they have a unit euclidean length.

    Returns a collections.defaultdict(int), so looking up a missing key
    yields 0 rather than raising KeyError.

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    # Generator expression: no throwaway list of squares is materialised.
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    return collections.defaultdict(int, ((k, v / length)
                                         for (k, v) in frequencies.items()))
36
def identity_scale(frequencies):
    """Return the given frequencies unchanged.

    A no-op scaler: useful when an API requires a scaling function but
    no actual scaling is wanted.
    """
    return frequencies
42
43
def l2(frequencies1, frequencies2):
    """Find the L2 (euclidean) distance between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.732050807...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.81649658...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    # sum of squared differences, then square root; ** 0.5 always
    # produces a float, so equal profiles give 0.0, not int 0.
    return sum((frequencies1[k] - frequencies2[k]) ** 2
               for k in frequencies1) ** 0.5
euclidean_distance = l2
68
def l1(frequencies1, frequencies2):
    """Find the L1 (manhattan) distance between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    # sum() preserves the input's numeric type: int profiles give an
    # int distance, as the doctests rely on.
    return sum(abs(frequencies1[k] - frequencies2[k])
               for k in frequencies1)
88
def l3(frequencies1, frequencies2):
    """Find the L3 distance between two frequency profiles, expressed as
    dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.4422495703...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.718144896...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.6299605249...
    """
    # sum of cubed absolute differences, then the cube root;
    # ** (1/3) always produces a float.
    return sum(abs(frequencies1[k] - frequencies2[k]) ** 3
               for k in frequencies1) ** (1 / 3)
111
def geometric_mean(frequencies1, frequencies2):
    """Find the product of the absolute differences between two frequency
    profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    NOTE(review): despite the name, this returns the raw product of the
    absolute differences — the len-th root that would make it a true
    geometric mean is never taken. The doctests below pin this behaviour,
    so it is documented rather than changed.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.01382140...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.009259259...
    """
    total = 1
    for k in frequencies1:
        total *= abs(frequencies1[k] - frequencies2[k])
    return total
137
def harmonic_mean(frequencies1, frequencies2):
    """Find the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.285714285...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.228571428571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2
    """
    total = 0
    for k in frequencies1:
        # Compute each absolute difference once, not twice per iteration.
        difference = abs(frequencies1[k] - frequencies2[k])
        if difference == 0:
            # A zero difference would divide by zero below; the harmonic
            # mean with any zero term is taken to be 0 (int, per doctest).
            return 0
        total += 1 / difference
    return len(frequencies1) / total
165
166
def cosine_similarity(frequencies1, frequencies2):
    """Find the cosine similarity between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.5773502691...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.7071067811...
    """
    # Dot product over the keys of frequencies1; the vector lengths use
    # each profile's own full set of values, as in the original loops.
    numerator = sum(frequencies1[k] * frequencies2[k] for k in frequencies1)
    length1 = sum(f ** 2 for f in frequencies1.values())
    length2 = sum(f ** 2 for f in frequencies2.values())
    return numerator / (length1 ** 0.5 * length2 ** 0.5)
190
191
192
if __name__ == "__main__":
    # Run the module's doctests when executed as a script.
    import doctest
    doctest.testmod()