More whitespace and line length tidying
[cipher-tools.git] / norms.py
1 import collections
2
3 def normalise(frequencies):
4 """Scale a set of frequenies so they have a unit euclidean length
5
6 >>> sorted(normalise({1: 1, 2: 0}).items())
7 [(1, 1.0), (2, 0.0)]
8 >>> sorted(normalise({1: 1, 2: 1}).items())
9 [(1, 0.7071067811865475), (2, 0.7071067811865475)]
10 >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
11 [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
12 >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
13 [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
14 """
15 length = sum([f ** 2 for f in frequencies.values()]) ** 0.5
16 return collections.defaultdict(int, ((k, v / length)
17 for (k, v) in frequencies.items()))
18
19 def scale(frequencies):
20 """Scale a set of frequencies so the largest is 1
21
22 >>> sorted(scale({1: 1, 2: 0}).items())
23 [(1, 1.0), (2, 0.0)]
24 >>> sorted(scale({1: 1, 2: 1}).items())
25 [(1, 1.0), (2, 1.0)]
26 >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
27 [(1, 1.0), (2, 1.0), (3, 1.0)]
28 >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
29 [(1, 0.5), (2, 1.0), (3, 0.5)]
30 """
31 largest = max(frequencies.values())
32 return collections.defaultdict(int, ((k, v / largest)
33 for (k, v) in frequencies.items()))
34
35
36 def l2(frequencies1, frequencies2):
37 """Finds the distances between two frequency profiles, expressed as dictionaries.
38 Assumes every key in frequencies1 is also in frequencies2
39
40 >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
41 0.0
42 >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
43 1.7320508075688772
44 >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
45 0.0
46 >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
47 1.7320508075688772
48 >>> l2(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
49 0.9194016867619662
50 >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
51 1.0
52 """
53 total = 0
54 for k in frequencies1.keys():
55 total += (frequencies1[k] - frequencies2[k]) ** 2
56 return total ** 0.5
57 euclidean_distance = l2
58
59 def l1(frequencies1, frequencies2):
60 """Finds the distances between two frequency profiles, expressed as dictionaries.
61 Assumes every key in frequencies1 is also in frequencies2
62
63 >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
64 0
65 >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
66 3
67 >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
68 0.0
69 >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
70 3
71 >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
72 1
73 """
74 total = 0
75 for k in frequencies1.keys():
76 total += abs(frequencies1[k] - frequencies2[k])
77 return total
78
79 def l3(frequencies1, frequencies2):
80 """Finds the distances between two frequency profiles, expressed as dictionaries.
81 Assumes every key in frequencies1 is also in frequencies2
82
83 >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
84 0.0
85 >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
86 1.4422495703074083
87 >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
88 1.4422495703074083
89 >>> l3(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
90 0.7721675487598008
91 >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
92 1.0
93 >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1}))
94 0.7234757712960591
95 """
96 total = 0
97 for k in frequencies1.keys():
98 total += abs(frequencies1[k] - frequencies2[k]) ** 3
99 return total ** (1/3)
100
101 def geometric_mean(frequencies1, frequencies2):
102 """Finds the geometric mean of the absolute differences between two frequency profiles,
103 expressed as dictionaries.
104 Assumes every key in frequencies1 is also in frequencies2
105
106 >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
107 1
108 >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
109 1
110 >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
111 3
112 >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))
113 0.057022248808851934
114 >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
115 0.0
116 >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))
117 0.009720703533656434
118 """
119 total = 1
120 for k in frequencies1.keys():
121 total *= abs(frequencies1[k] - frequencies2[k])
122 return total
123
124 def harmonic_mean(frequencies1, frequencies2):
125 """Finds the harmonic mean of the absolute differences between two frequency profiles,
126 expressed as dictionaries.
127 Assumes every key in frequencies1 is also in frequencies2
128
129 >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
130 1.0
131 >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
132 1.0
133 >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
134 1.2857142857142858
135 >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))
136 0.3849001794597505
137 >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
138 0
139 >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))
140 0.17497266360581604
141 """
142 total = 0
143 for k in frequencies1.keys():
144 if abs(frequencies1[k] - frequencies2[k]) == 0:
145 return 0
146 total += 1 / abs(frequencies1[k] - frequencies2[k])
147 return len(frequencies1) / total
148
149
150 def cosine_distance(frequencies1, frequencies2):
151 """Finds the distances between two frequency profiles, expressed as dictionaries.
152 Assumes every key in frequencies1 is also in frequencies2
153
154 >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
155 -2.220446049250313e-16
156 >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
157 -2.220446049250313e-16
158 >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
159 0.42264973081037416
160 >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
161 0.29289321881345254
162 """
163 numerator = 0
164 length1 = 0
165 length2 = 0
166 for k in frequencies1.keys():
167 numerator += frequencies1[k] * frequencies2[k]
168 length1 += frequencies1[k]**2
169 for k in frequencies2.keys():
170 length2 += frequencies2[k]
171 return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
172
173
174 if __name__ == "__main__":
175 import doctest
176 doctest.testmod()