import collections


def lp(v1, v2=None, p=2):
    """Find the L_p norm. If passed one vector, find the length of that vector.
    If passed two vectors, find the length of the difference between them.
    Keys missing from one vector are treated as zero.
    """
    if v2 is not None:
        vec = {k: abs(v1.get(k, 0) - v2.get(k, 0))
               for k in (v1.keys() | v2.keys())}
    else:
        vec = v1
    return sum(v ** p for v in vec.values()) ** (1.0 / p)

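# Example: with the default p=2, lp gives the Euclidean length, so
#   lp({'x': 3, 'y': 4}) == (3**2 + 4**2) ** 0.5 == 5.0
# and with two vectors it measures their difference, so
#   lp({'x': 1}, {'y': 1}, p=1) == abs(1 - 0) + abs(0 - 1) == 2.0
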
def l1(v1, v2=None):
    """Find the L1 distance between two frequency profiles, expressed as
    dictionaries. Keys missing from one profile are treated as zero.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3.0
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3.0
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    return lp(v1, v2, 1)

def l2(v1, v2=None):
    """Find the Euclidean (L2) distance between two frequency profiles,
    expressed as dictionaries. Keys missing from one profile are treated
    as zero.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.732050807...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.81649658...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    return lp(v1, v2, 2)

def l3(v1, v2=None):
    """Find the L3 distance between two frequency profiles, expressed as
    dictionaries. Keys missing from one profile are treated as zero.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.4422495703...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.718144896...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.6299605249...
    """
    return lp(v1, v2, 3)

def linf(v1, v2=None):
    """Find the L_infinity (Chebyshev) norm: the largest component of a
    vector, or the largest absolute difference between two vectors. Keys
    missing from one vector are treated as zero.
    """
    if v2 is not None:
        vec = {k: abs(v1.get(k, 0) - v2.get(k, 0))
               for k in (v1.keys() | v2.keys())}
    else:
        vec = v1
    return max(vec.values())

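# Example: linf picks out the single largest deviation, so
#   linf({'a': 1, 'b': 3, 'c': 2}) == 3
#   linf({'a': 4, 'b': 1}, {'a': 1, 'b': 1}) == 3
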
def scale(frequencies, norm=l2):
    """Scale a set of frequencies so they have unit length under the given
    norm. Assumes the frequencies are not all zero.
    """
    length = norm(frequencies)
    return collections.defaultdict(int,
        {k: v / length for k, v in frequencies.items()})

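# Example: with the default L2 norm, a 3-4-5 triangle scales to unit length:
#   scale({'a': 3, 'b': 4}) == {'a': 0.6, 'b': 0.8}
# (returned as a defaultdict, so absent keys read as zero).
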
def l2_scale(f):
    """Scale a set of frequencies so they have unit Euclidean length.

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    return scale(f, l2)

def l1_scale(f):
    """Scale a set of frequencies so they sum to one.

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    return scale(f, l1)

# Aliases: the doctests above are written in terms of these names.
normalise = l1_scale
euclidean_distance = l2
euclidean_scale = l2_scale

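# Sketch of typical use in cipher breaking (`candidate` and `english_counts`
# are stand-ins, not defined in this module): score a candidate decryption by
# how far its normalised letter counts sit from normalised English frequencies.
#
#   score = euclidean_distance(normalise(collections.Counter(candidate)),
#                              normalise(english_counts))
#
# Lower scores mean the letter distribution looks more like English.
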
def geometric_mean(frequencies1, frequencies2):
    """Find the geometric mean of the absolute differences between two
    frequency profiles, expressed as dictionaries. (Strictly, this returns
    the product of the differences without taking the n-th root; for profiles
    with a fixed number of keys, that gives the same ranking.) Considers only
    the keys in frequencies1; keys missing from frequencies2 count as zero.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.01382140...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
                       normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.009259259...
    """
    total = 1.0
    for k in frequencies1:
        total *= abs(frequencies1[k] - frequencies2.get(k, 0))
    return total

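# Worked example: for {'a':2, 'b':2, 'c':2} against {'a':1, 'b':5, 'c':1} the
# absolute differences are 1, 3 and 1, so the returned product is
# 1 * 3 * 1 == 3.0; a true geometric mean would take the cube root,
# about 1.442.
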
def harmonic_mean(frequencies1, frequencies2):
    """Find the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries. Considers only the keys
    in frequencies1; keys missing from frequencies2 count as zero.

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.285714285...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.228571428571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
                      normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2
    """
    total = 0.0
    for k in frequencies1:
        difference = abs(frequencies1[k] - frequencies2.get(k, 0))
        if difference == 0:
            # The harmonic mean tends to zero as any term tends to zero,
            # so short-circuit rather than divide by zero.
            return 0.0
        total += 1.0 / difference
    return len(frequencies1) / total

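# Worked example: {'a':2, 'b':2, 'c':2} against {'a':1, 'b':5, 'c':1} gives
# differences 1, 3 and 1, so the harmonic mean is 3 / (1/1 + 1/3 + 1/1) = 9/7,
# about 1.2857, matching the doctest above.
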
def cosine_similarity(frequencies1, frequencies2):
    """Find the cosine similarity between two frequency profiles, expressed
    as dictionaries: the cosine of the angle between them, so 1.0 means the
    profiles are proportional. Keys missing from frequencies2 count as zero.

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.5773502691...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.7071067811...
    """
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1:
        numerator += frequencies1[k] * frequencies2.get(k, 0)
        length1 += frequencies1[k] ** 2
    for k in frequencies2:
        length2 += frequencies2[k] ** 2
    return numerator / (length1 ** 0.5 * length2 ** 0.5)

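# For L2-normalised profiles, Euclidean distance and cosine similarity carry
# the same information: |u - v|**2 = |u|**2 + |v|**2 - 2*u.v = 2 - 2*cos(theta).
# A quick check with the functions above:
#
#   u = euclidean_scale({'a': 0, 'b': 1})
#   v = euclidean_scale({'a': 1, 'b': 1})
#   abs(l2(u, v) ** 2 - (2 - 2 * cosine_similarity(u, v))) < 1e-12  # True
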
if __name__ == "__main__":
    import doctest
    doctest.testmod()