Changed norms.normalise to sum elements to one (e.g. probabilities). Index of coincid...
[cipher-tools.git] / norms.py
import collections

def normalise(frequencies):
    """Scale a set of frequencies so they sum to one

    >>> sorted(normalise({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(normalise({1: 1, 2: 1}).items())
    [(1, 0.5), (2, 0.5)]
    >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.333...), (2, 0.333...), (3, 0.333...)]
    >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.25), (2, 0.5), (3, 0.25)]
    """
    length = sum(frequencies.values())
    return collections.defaultdict(int, ((k, v / length)
                                         for (k, v) in frequencies.items()))

def euclidean_scale(frequencies):
    """Scale a set of frequencies so they have a unit euclidean length

    >>> sorted(euclidean_scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.7071067...), (2, 0.7071067...)]
    >>> sorted(euclidean_scale({1: 1, 2: 1, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.577350...), (2, 0.577350...), (3, 0.577350...)]
    >>> sorted(euclidean_scale({1: 1, 2: 2, 3: 1}).items()) # doctest: +ELLIPSIS
    [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)]
    """
    length = sum(f ** 2 for f in frequencies.values()) ** 0.5
    return collections.defaultdict(int, ((k, v / length)
                                         for (k, v) in frequencies.items()))

def scale(frequencies):
    """Scale a set of frequencies so the largest is 1

    >>> sorted(scale({1: 1, 2: 0}).items())
    [(1, 1.0), (2, 0.0)]
    >>> sorted(scale({1: 1, 2: 1}).items())
    [(1, 1.0), (2, 1.0)]
    >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
    [(1, 1.0), (2, 1.0), (3, 1.0)]
    >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
    [(1, 0.5), (2, 1.0), (3, 0.5)]
    """
    largest = max(frequencies.values())
    return collections.defaultdict(int, ((k, v / largest)
                                         for (k, v) in frequencies.items()))

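# (Note: for the non-negative counts used here, the three scalings above are
# the l1, l2 and l-infinity normalisations of a profile: normalise divides by
# the sum of the values, euclidean_scale by their euclidean length, and scale
# by the largest value.)
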
def l2(frequencies1, frequencies2):
    """Finds the l2 (euclidean) distance between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.73205080...
    >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.732050807...
    >>> l2(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.81649658...
    >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    """
    total = 0
    for k in frequencies1.keys():
        total += (frequencies1[k] - frequencies2[k]) ** 2
    return total ** 0.5

euclidean_distance = l2

def l1(frequencies1, frequencies2):
    """Finds the l1 (Manhattan) distance between two frequency profiles,
    expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0
    >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
    0.0
    >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
    3
    >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
    1
    """
    total = 0
    for k in frequencies1.keys():
        total += abs(frequencies1[k] - frequencies2[k])
    return total

def l3(frequencies1, frequencies2):
    """Finds the l3 distance between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in frequencies2

    >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
    0.0
    >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.44224957...
    >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.4422495703...
    >>> l3(normalise({'a':0, 'b':2, 'c':0}), \
           normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.718144896...
    >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
    1.0
    >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1})) # doctest: +ELLIPSIS
    0.6299605249...
    """
    total = 0
    for k in frequencies1.keys():
        total += abs(frequencies1[k] - frequencies2[k]) ** 3
    return total ** (1 / 3)

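# (Sketch, not in the original module: l1, l2 and l3 above are special cases
# of the Minkowski l_p distance, which could be written generically as below;
# the expected values match the l2 and l3 doctests above.)
def lp(frequencies1, frequencies2, p=2):
    """Finds the l_p distance between two frequency profiles, expressed as
    dictionaries. Assumes every key in frequencies1 is also in frequencies2

    >>> lp({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    1.7320508...
    >>> lp({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}, 3) # doctest: +ELLIPSIS
    1.4422495...
    """
    total = 0
    for k in frequencies1.keys():
        total += abs(frequencies1[k] - frequencies2[k]) ** p
    return total ** (1 / p)
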
def geometric_mean(frequencies1, frequencies2):
    """Finds the product of the absolute differences between two frequency
    profiles, expressed as dictionaries (i.e. the geometric mean of the
    differences without the final root).
    Assumes every key in frequencies1 is also in frequencies2

    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1
    >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
    3
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.01382140...
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0.0
    >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.009259259...
    """
    total = 1
    for k in frequencies1.keys():
        total *= abs(frequencies1[k] - frequencies2[k])
    # NB: a true geometric mean would be total ** (1 / len(frequencies1));
    # the raw product is returned, which ranks profiles of the same size
    # in the same order.
    return total

def harmonic_mean(frequencies1, frequencies2):
    """Finds the harmonic mean of the absolute differences between two
    frequency profiles, expressed as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
    1.0
    >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1}) # doctest: +ELLIPSIS
    1.285714285...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':5, 'c':1})) # doctest: +ELLIPSIS
    0.228571428571...
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':1})) # doctest: +ELLIPSIS
    0
    >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), \
        normalise({'a':1, 'b':1, 'c':0})) # doctest: +ELLIPSIS
    0.2
    """
    total = 0
    for k in frequencies1.keys():
        if abs(frequencies1[k] - frequencies2[k]) == 0:
            # a zero difference would make its reciprocal infinite, so
            # treat the harmonic mean as zero
            return 0
        total += 1 / abs(frequencies1[k] - frequencies2[k])
    return len(frequencies1) / total


def cosine_distance(frequencies1, frequencies2):
    """Finds the cosine distance between two frequency profiles, expressed
    as dictionaries.
    Assumes every key in frequencies1 is also in frequencies2

    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    -2.22044604...e-16
    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    -2.22044604...e-16
    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
    0.4226497308...
    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
    0.29289321881...
    """
    numerator = 0
    length1 = 0
    length2 = 0
    for k in frequencies1.keys():
        numerator += frequencies1[k] * frequencies2[k]
        length1 += frequencies1[k] ** 2
    for k in frequencies2.keys():
        # squared euclidean length of the second profile
        length2 += frequencies2[k] ** 2
    return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))


def index_of_coincidence(frequencies):
    """Finds the (expected) index of coincidence for a set of frequencies,
    scaled by the number of distinct symbols (so that, for normalised
    frequencies, a uniform distribution scores 1.0)
    """
    return sum(f ** 2 for f in frequencies.values()) * len(frequencies)
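# Illustrative checks of the scaling (examples added here, not in the
# original file; both values are exact for normalised profiles):
#   index_of_coincidence(normalise({'a': 1, 'b': 1}))  # 1.0 -- uniform
#   index_of_coincidence(normalise({'a': 1, 'b': 0}))  # 2.0 -- all mass on one symbol
# Scaled this way, uniformly random letter counts score about 1.0 and typical
# English text about 1.7.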


if __name__ == "__main__":
    import doctest
    doctest.testmod()