Added experiment for checking which metrics work best for using unigram frequency...
[cipher-tools.git] / norms.py
1 import collections
2
3 def normalise(frequencies):
4 """Scale a set of frequenies so they have a unit euclidean length
5
6 >>> sorted(normalise({1: 1, 2: 0}).items())
7 [(1, 1.0), (2, 0.0)]
8 >>> sorted(normalise({1: 1, 2: 1}).items())
9 [(1, 0.7071067811865475), (2, 0.7071067811865475)]
10 >>> sorted(normalise({1: 1, 2: 1, 3: 1}).items())
11 [(1, 0.5773502691896258), (2, 0.5773502691896258), (3, 0.5773502691896258)]
12 >>> sorted(normalise({1: 1, 2: 2, 3: 1}).items())
13 [(1, 0.4082482904638631), (2, 0.8164965809277261), (3, 0.4082482904638631)]
14 """
15 length = sum([f ** 2 for f in frequencies.values()]) ** 0.5
16 return collections.defaultdict(int, ((k, v / length) for (k, v) in frequencies.items()))
17
18 def scale(frequencies):
19 """Scale a set of frequencies so the largest is 1
20
21 >>> sorted(scale({1: 1, 2: 0}).items())
22 [(1, 1.0), (2, 0.0)]
23 >>> sorted(scale({1: 1, 2: 1}).items())
24 [(1, 1.0), (2, 1.0)]
25 >>> sorted(scale({1: 1, 2: 1, 3: 1}).items())
26 [(1, 1.0), (2, 1.0), (3, 1.0)]
27 >>> sorted(scale({1: 1, 2: 2, 3: 1}).items())
28 [(1, 0.5), (2, 1.0), (3, 0.5)]
29 """
30 largest = max(frequencies.values())
31 return collections.defaultdict(int, ((k, v / largest) for (k, v) in frequencies.items()))
32
33
34 def l2(frequencies1, frequencies2):
35 """Finds the distances between two frequency profiles, expressed as dictionaries.
36 Assumes every key in frequencies1 is also in frequencies2
37
38 >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
39 0.0
40 >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
41 1.7320508075688772
42 >>> l2(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
43 0.0
44 >>> l2({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
45 1.7320508075688772
46 >>> l2(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
47 0.9194016867619662
48 >>> l2({'a':0, 'b':1}, {'a':1, 'b':1})
49 1.0
50 """
51 total = 0
52 for k in frequencies1.keys():
53 total += (frequencies1[k] - frequencies2[k]) ** 2
54 return total ** 0.5
55 euclidean_distance = l2
56
57 def l1(frequencies1, frequencies2):
58 """Finds the distances between two frequency profiles, expressed as dictionaries.
59 Assumes every key in frequencies1 is also in frequencies2
60
61 >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
62 0
63 >>> l1({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
64 3
65 >>> l1(normalise({'a':2, 'b':2, 'c':2}), normalise({'a':1, 'b':1, 'c':1}))
66 0.0
67 >>> l1({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
68 3
69 >>> l1({'a':0, 'b':1}, {'a':1, 'b':1})
70 1
71 """
72 total = 0
73 for k in frequencies1.keys():
74 total += abs(frequencies1[k] - frequencies2[k])
75 return total
76
77 def l3(frequencies1, frequencies2):
78 """Finds the distances between two frequency profiles, expressed as dictionaries.
79 Assumes every key in frequencies1 is also in frequencies2
80
81 >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
82 0.0
83 >>> l3({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
84 1.4422495703074083
85 >>> l3({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
86 1.4422495703074083
87 >>> l3(normalise({'a':0, 'b':2, 'c':0}), normalise({'a':1, 'b':1, 'c':1}))
88 0.7721675487598008
89 >>> l3({'a':0, 'b':1}, {'a':1, 'b':1})
90 1.0
91 >>> l3(normalise({'a':0, 'b':1}), normalise({'a':1, 'b':1}))
92 0.7234757712960591
93 """
94 total = 0
95 for k in frequencies1.keys():
96 total += abs(frequencies1[k] - frequencies2[k]) ** 3
97 return total ** (1/3)
98
99 def geometric_mean(frequencies1, frequencies2):
100 """Finds the distances between two frequency profiles, expressed as dictionaries.
101 Assumes every key in frequencies1 is also in frequencies2
102
103 """
104 total = 0
105 for k in frequencies1.keys():
106 total *= abs(frequencies1[k] - frequencies2[k])
107 return total
108
109 def harmonic_mean(frequencies1, frequencies2):
110 """Finds the distances between two frequency profiles, expressed as dictionaries.
111 Assumes every key in frequencies1 is also in frequencies2
112
113 """
114 total = 0
115 for k in frequencies1.keys():
116 total += 1 / abs(frequencies1[k] - frequencies2[k])
117 return 1 / total
118
119
120 def cosine_distance(frequencies1, frequencies2):
121 """Finds the distances between two frequency profiles, expressed as dictionaries.
122 Assumes every key in frequencies1 is also in frequencies2
123
124 >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
125 -2.220446049250313e-16
126 >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
127 -2.220446049250313e-16
128 >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1})
129 0.42264973081037416
130 >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1})
131 0.29289321881345254
132 """
133 numerator = 0
134 length1 = 0
135 length2 = 0
136 for k in frequencies1.keys():
137 numerator += frequencies1[k] * frequencies2[k]
138 length1 += frequencies1[k]**2
139 for k in frequencies2.keys():
140 length2 += frequencies2[k]
141 return 1 - (numerator / (length1 ** 0.5 * length2 ** 0.5))
142
143
144 if __name__ == "__main__":
145 import doctest
146 doctest.testmod()