Segmentation working, though hits recursion limit for texts longer than 250 characters
[cipher-tools.git] / norms.py
1 import collections
2
3 def normalise(frequencies):
4 """Scale a set of frequenies so they have a unit euclidean length
5
6 >>> sorted(normalise({1: 1, 2: 0}).items())
7 [(1, 1.0), (2, 0.0)]
8 >>> sorted(normalise({1: 1, 2: 1}).items())
9 [(1, 0.7071067811865475), (2, 0.7071067811865475)]
10 >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
11 [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
12 >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
13 [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
14 """
15 length = sum([f ** 2 for f in frequencies.values()]) ** 0.5
16 return collections.defaultdict(int, ((k, v / length) for (k, v) in frequencies.items()))
17
18 def scale(frequencies):
19 """Scale a set of frequencies so the largest is 1
20
21 >>> sorted(scale({1: 1, 2: 0}).items())
22 [(1, 1.0), (2, 0.0)]
23 >>> sorted(scale({1: 1, 2: 1}).items())
24 [(1, 1.0), (2, 1.0)]
25 >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
26 [(1, 1.0), (2, 1.0), (3, 1.0)]
27 >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
28 [(1, 0.5), (2, 1.0), (3, 0.5)]
29 """
30 largest = max(frequencies.values())
31 return collections.defaultdict(int, ((k, v / largest) for (k, v) in frequencies.items()))
32
33
34 def l2(frequencies1, frequencies2):
35 """Finds the distances between two frequency profiles, expressed as dictionaries.
36 Assumes every key in frequencies1 is also in frequencies2
37
38 >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
39 0.0
40 >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
41 1.7320508075688772
42 >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
43 0.0
44 >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
45 1.7320508075688772
46 >>> l2(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
47 0.9194016867619662
48 >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
49 1.0
50 """
51 total = 0
52 for k in frequencies1.keys():
53 total += (frequencies1[k] - frequencies2[k]) ** 2
54 return total ** 0.5
55 euclidean_distance = l2
56
57 def l1(frequencies1, frequencies2):
58 """Finds the distances between two frequency profiles, expressed as dictionaries.
59 Assumes every key in frequencies1 is also in frequencies2
60
61 >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
62 0
63 >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
64 3
65 >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
66 0.0
67 >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
68 3
69 >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
70 1
71 """
72 total = 0
73 for k in frequencies1.keys():
74 total += abs(frequencies1[k] - frequencies2[k])
75 return total
76
77 def l3(frequencies1, frequencies2):
78 """Finds the distances between two frequency profiles, expressed as dictionaries.
79 Assumes every key in frequencies1 is also in frequencies2
80
81 >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
82 0.0
83 >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
84 1.4422495703074083
85 >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
86 1.4422495703074083
87 >>> l3(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
88 0.7721675487598008
89 >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
90 1.0
91 >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1}))
92 0.7234757712960591
93 """
94 total = 0
95 for k in frequencies1.keys():
96 total += abs(frequencies1[k] - frequencies2[k]) ** 3
97 return total ** (1/3)
98
99 def geometric_mean(frequencies1, frequencies2):
100 """Finds the geometric mean of the absolute differences between two frequency profiles,
101 expressed as dictionaries.
102 Assumes every key in frequencies1 is also in frequencies2
103
104 >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
105 1
106 >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
107 1
108 >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
109 3
110 >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))
111 0.057022248808851934
112 >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
113 0.0
114 >>> geometric_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))
115 0.009720703533656434
116 """
117 total = 1
118 for k in frequencies1.keys():
119 total *= abs(frequencies1[k] - frequencies2[k])
120 return total
121
122 def harmonic_mean(frequencies1, frequencies2):
123 """Finds the harmonic mean of the absolute differences between two frequency profiles,
124 expressed as dictionaries.
125 Assumes every key in frequencies1 is also in frequencies2
126
127 >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
128 1.0
129 >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
130 1.0
131 >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':5, 'c':1})
132 1.2857142857142858
133 >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':5, 'c':1}))
134 0.3849001794597505
135 >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
136 0
137 >>> harmonic_mean(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':0}))
138 0.17497266360581604
139 """
140 total = 0
141 for k in frequencies1.keys():
142 if abs(frequencies1[k] - frequencies2[k]) == 0:
143 return 0
144 total += 1 / abs(frequencies1[k] - frequencies2[k])
145 return len(frequencies1) / total
146
147
148 def cosine_distance(frequencies1, frequencies2):
149 """Finds the distances between two frequency profiles, expressed as dictionaries.
150 Assumes every key in frequencies1 is also in frequencies2
151
152 >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
153 -2.220446049250313e-16
154 >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
155 -2.220446049250313e-16
156 >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
157 0.42264973081037416
158 >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
159 0.29289321881345254
160 """
161 numerator = 0
162 length1 = 0
163 length2 = 0
164 for k in frequencies1.keys():
165 numerator += frequencies1[k] * frequencies2[k]
166 length1 += frequencies1[k]**2
167 for k in frequencies2.keys():
168 length2 += frequencies2[k]
169 return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
170
171
172 if __name__ == "__main__":
173 import doctest
174 doctest.testmod()