ca984a30f96e4b09545eb09b277a5b0dc188be28
[cipher-tools.git] / support / utilities.py
import collections
import string
import unicodedata
from itertools import zip_longest
4
# join a list of letters into a string
cat = ''.join

# join a list of words into a string, separated by spaces
wcat = ' '.join

# join a list of lines into a string, separated by newlines
lcat = '\n'.join
13
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Accepts either case; raises ValueError for non-ASCII-letter input.
    """
    # Try each alphabet in turn, offsetting from that alphabet's first letter.
    for alphabet, base in ((string.ascii_lowercase, 'a'),
                           (string.ascii_uppercase, 'A')):
        if letter in alphabet:
            return ord(letter) - ord(base)
    raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
22
def unpos(number):
    """Return the letter in the given position in the alphabet (mod 26)"""
    # Wrap any integer into 0-25, then offset from 'a'.
    return chr(ord('a') + number % 26)
26
def every_nth(text, n, fillvalue=''):
    """Returns n strings, each of which consists of every nth character,
    starting with the 0th, 1st, 2nd, ... (n-1)th character

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Cut the text into rows of n characters, then read it column-wise:
    # column i collects every nth character starting at offset i.
    rows = chunks(text, n, fillvalue)
    columns = zip_longest(*rows, fillvalue=fillvalue)
    return [''.join(column) for column in columns]
43
def combine_every_nth(split_text):
    """Reforms a text split into every_nth strings

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Interleave the strings character by character; shorter strings simply
    # contribute nothing ('' fill) once exhausted.
    interleaved = zip_longest(*split_text, fillvalue='')
    return ''.join(''.join(group) for group in interleaved)
56
def chunks(text, n, fillvalue=None):
    """Split a text into chunks of n characters

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    # When a (truthy) fillvalue is given, pad with its first character so the
    # final chunk reaches full length; the slice bounds below guarantee any
    # surplus padding is never emitted.
    padded = text
    if fillvalue:
        padded = text + fillvalue[0] * (n - len(text) % n)
    return [padded[start:start + n] for start in range(0, len(text), n)]
72
def transpose(items, transposition):
    """Moves items around according to the given transposition

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    # Output position p receives the item at source index transposition[p].
    return [items[source] for source in transposition]
87
def untranspose(items, transposition):
    """Undoes a transpose

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    # Scatter each item back to the position it originally came from:
    # item p of the transposed sequence returns to index transposition[p].
    restored = [''] * len(transposition)
    for position, original_index in enumerate(transposition):
        restored[original_index] = items[position]
    return restored
102
def deduplicate(text):
    """Return the distinct elements of text as a list, keeping only the
    first occurrence of each and preserving their original order.

    Works on any iterable (each element must be hashable); a string yields
    a list of its distinct characters.

    >>> deduplicate('happy')
    ['h', 'a', 'p', 'y']
    >>> deduplicate('abcabc')
    ['a', 'b', 'c']
    >>> deduplicate('')
    []
    """
    # OrderedDict.fromkeys keeps one key per element, in first-seen order.
    return list(collections.OrderedDict.fromkeys(text))
105
106
def letters(text):
    """Remove all non-alphabetic characters from a text
    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    # Membership set makes the per-character test O(1).
    alphabetic = frozenset(string.ascii_letters)
    return ''.join(ch for ch in text if ch in alphabetic)
115
# Special characters for conversion, such as smart quotes.
unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})

def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatability
    form, dropping all the combining accents, then re-encoding the bytes.

    NOTE(review): relies on the unicodedata module, which the original file
    never imported — added to the top-of-file imports.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # First map smart quotes to their plain ASCII equivalents; NFKD would
    # otherwise drop them entirely in the encode step below.
    translated_text = text.translate(unaccent_specials)
    # NFKD splits accented letters into base letter + combining mark; the
    # ascii encode with 'ignore' then discards the combining marks.
    decomposed = unicodedata.normalize('NFKD', translated_text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')
139
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Strip accents first so accented letters survive the ASCII filter.
    ascii_text = unaccent(text)
    return letters(ascii_text).lower()
151
152
def index_of_coincidence(text):
    """Return the index of coincidence of text, normalised by alphabet size.

    The text is sanitised (lowercased, non-letters removed) before counting.
    Computed as sum(c * (c - 1)) / (n * (n - 1) / 26) over the letter counts
    c, where n is the number of letters; ~1.73 for English, ~1.0 for uniform
    random letters. Returns 0 for texts with fewer than two letters.

    Fixes two bugs in the original: the numerator computed
    counts[l] * counts[l] - 1 (i.e. c**2 - 1, masked by a max(..., 0)) instead
    of c * (c - 1), and the denominator mixed len(stext) with len(text).
    """
    stext = sanitise(text)
    n = len(stext)
    if n < 2:
        # No letter pairs to compare; avoid dividing by zero.
        return 0
    counts = collections.Counter(stext)
    # stext contains only lowercase letters, so counts.values() covers
    # exactly the letters that occur; absent letters contribute 0 anyway.
    numerator = sum(c * (c - 1) for c in counts.values())
    return numerator / (n * (n - 1) / 26)
162
163
def frequencies(text):
    """Count the number of occurrences of each character in text

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter iterates the string directly; missing keys count as 0.
    return collections.Counter(text)
192
193 if __name__ == "__main__":
194 import doctest