Started on documentation
[szyfrow.git] / szyfrow / support / norms.py
"""Various norms, for calculating the distances between two frequency
profiles.
"""
4
5 import collections
6 from math import log10
7
def lp(v1, v2=None, p=2):
    """Find the L_p norm.

    If passed one vector, find the length of that vector. If passed two
    vectors, find the length of the difference between them. Vectors are
    dicts mapping keys to numbers; a key absent from one vector is treated
    as having value 0, so the two profiles need not share the same keys.
    """
    if v2 is not None:
        # Default missing keys to 0 so profiles with different key sets
        # don't raise KeyError (the original indexed both dicts over the
        # union of keys directly). `is not None` also makes an explicit
        # empty second profile behave as a genuine difference, rather
        # than being silently treated as "no second vector".
        vec = {k: abs(v1.get(k, 0) - v2.get(k, 0))
               for k in (v1.keys() | v2.keys())}
    else:
        vec = v1
    # abs() keeps the norm well-defined even for negative components
    # (e.g. odd p); for the usual non-negative frequency profiles the
    # result is unchanged.
    return sum(abs(v) ** p for v in vec.values()) ** (1.0 / p)
17
def l1(v1, v2=None):
    """The L_1 (taxicab) distance between two frequency profiles,
    expressed as dictionaries. Assumes every key in frequencies1 is also
    in frequencies2. With one argument, finds the L_1 length of that
    profile.

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3.0
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3.0
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    return lp(v1, v2, p=1)
34
def l2(v1, v2=None):
    """The L_2 (euclidean) distance between two frequency profiles,
    expressed as dictionaries. Assumes every key in frequencies1 is also
    in frequencies2. With one argument, finds the euclidean length of
    that profile.

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.732050807...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.81649658...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    return lp(v1, v2, p=2)
54
def l3(v1, v2=None):
    """The L_3 distance between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in
    frequencies2. With one argument, finds the L_3 length of that
    profile.

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.4422495703...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.718144896...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.6299605249...
    """
    return lp(v1, v2, p=3)
74
def linf(v1, v2=None):
    """Find the L_infinity distance between two frequency profiles,
    expressed as dictionaries: the largest absolute difference in any
    component. With one argument, finds the largest absolute component
    of that profile.
    """
    if v2 is not None:
        # Default missing keys to 0 so profiles with different key sets
        # don't raise KeyError; `is not None` stops an explicit empty
        # profile from being treated as "no second vector".
        vec = {k: abs(v1.get(k, 0) - v2.get(k, 0))
               for k in (v1.keys() | v2.keys())}
    else:
        vec = v1
    # abs() keeps the result a true norm for negative components.
    return max(abs(v) for v in vec.values())
83
84
def scale(frequencies, norm=l2):
    """Scale a frequency profile so it has unit length under `norm`.

    Returns a `collections.defaultdict(int)`, so keys absent from the
    input read as zero.
    """
    length = norm(frequencies)
    scaled = {k: v / length for k, v in frequencies.items()}
    return collections.defaultdict(int, scaled)
89
def l2_scale(f):
    """Scale a set of frequencies so they have a unit euclidean length

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    return scale(f, norm=l2)
103
def l1_scale(f):
    """Scale a set of frequencies so they sum to one

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    return scale(f, norm=l1)
117
# Convenience aliases: `normalise` scales a profile to unit sum; the
# `euclidean_*` names are synonyms for the L_2 distance and L_2 scaling.
normalise = l1_scale
euclidean_distance = l2
euclidean_scale = l2_scale
121
122
def geometric_mean(frequencies1, frequencies2):
    """Find the product of the absolute differences between two frequency
    profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.

    NOTE(review): despite the name, no n-th root is taken, so this is the
    raw product of differences rather than a true geometric mean; the
    doctests below depend on that behaviour.

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.01382140...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.009259259...
    """
    product = 1.0
    for k, count in frequencies1.items():
        product *= abs(count - frequencies2[k])
    return product
148
def harmonic_mean(frequencies1, frequencies2):
    """Find the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2.
    Returns 0.0 as soon as any difference is zero (the limiting value,
    and it avoids dividing by zero).

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.285714285...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.228571428571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2
    """
    reciprocal_sum = 0.0
    for k in frequencies1:
        difference = abs(frequencies1[k] - frequencies2[k])
        if difference == 0:
            return 0.0
        reciprocal_sum += 1.0 / difference
    return len(frequencies1) / reciprocal_sum
176
177
def cosine_similarity(frequencies1, frequencies2):
    """Find the cosine similarity of two frequency profiles, expressed as
    dictionaries: their dot product divided by the product of their
    euclidean lengths (1.0 for parallel profiles, smaller as they
    diverge).
    Assumes every key in frequencies1 is also in frequencies2.

    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.0000000000...
    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.5773502691...
    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.7071067811...
    """
    # Dot product over the keys of the first profile, matching the
    # assumption above; lengths use each profile's own values.
    numerator = sum(frequencies1[k] * frequencies2[k] for k in frequencies1)
    length1 = sum(v ** 2 for v in frequencies1.values()) ** 0.5
    length2 = sum(v ** 2 for v in frequencies2.values()) ** 0.5
    return numerator / (length1 * length2)
200
201
202
if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings.
    import doctest
    doctest.testmod()