# Updated for challenge 9
# [cipher-tools.git] / support / utilities.py
1 import string
2 import collections
3 import unicodedata
4 from itertools import zip_longest
5
# Join a list of letters into a string.
cat = ''.join

# Join a list of words into a string, separated by spaces.
wcat = ' '.join

# Join a list of lines into a string, separated by newlines.
lcat = '\n'.join
14
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Accepts a single lowercase or uppercase ASCII letter; raises
    ValueError for anything else.
    """
    for alphabet, base in ((string.ascii_lowercase, 'a'),
                           (string.ascii_uppercase, 'A')):
        if letter in alphabet:
            return ord(letter) - ord(base)
    raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
23
def unpos(number):
    """Return the letter in the given position in the alphabet (mod 26).

    Any integer is accepted; it is reduced modulo 26 first, so negative
    and out-of-range positions wrap around.
    """
    return chr(ord('a') + number % 26)
27
def every_nth(text, n, fillvalue=''):
    """Returns n strings, each of which consists of every nth character,
    starting with the 0th, 1st, 2nd, ... (n-1)th character

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Lay the text out in rows of n, then read it off column by column.
    rows = chunks(text, n, fillvalue)
    columns = zip_longest(*rows, fillvalue=fillvalue)
    return [cat(column) for column in columns]
44
def combine_every_nth(split_text):
    """Reforms a text split into every_nth strings

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Interleave the strands character by character; strands may be
    # uneven by one character, hence zip_longest with an empty filler.
    interleaved = zip_longest(*split_text, fillvalue='')
    return ''.join(''.join(group) for group in interleaved)
57
def chunks(text, n, fillvalue=None):
    """Split a text into chunks of n characters

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    if fillvalue:
        # Pad the tail so the final chunk is a full n characters long.
        # (When the text divides evenly, the slice range below never
        # reaches the padding, so no spurious chunk appears.)
        padded = text + fillvalue[0] * (n - len(text) % n)
    else:
        padded = text
    return [padded[start:start + n] for start in range(0, len(text), n)]
73
def transpose(items, transposition):
    """Moves items around according to the given transposition

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    # Position p of the output takes the item at transposition[p].
    return [items[source] for source in transposition]
88
def untranspose(items, transposition):
    """Undoes a transpose

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    # Scatter each item back to the slot it originally came from.
    unshuffled = [''] * len(transposition)
    for current, original in enumerate(transposition):
        unshuffled[original] = items[current]
    return unshuffled
103
def deduplicate(text):
    """Return the distinct elements of text as a list, keeping only the
    first occurrence of each and preserving their order of appearance.
    """
    # dict preserves insertion order (Python 3.7+), matching the
    # original OrderedDict behaviour.
    return list(dict.fromkeys(text))
106
107
def letters(text):
    """Remove all non-alphabetic characters from a text

    Only ASCII letters are kept; accented characters are removed too
    (see unaccent/sanitise for handling those).

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    kept = (character for character in text
            if character in string.ascii_letters)
    return ''.join(kept)
116
# Special characters for conversion, such as smart quotes.
unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})

def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatability
    form, dropping all the combining accents, then re-encoding the bytes.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # First map smart quotes to their plain ASCII equivalents.
    plain = text.translate(unaccent_specials)
    # NFKD splits accented letters into base letter + combining mark;
    # the ascii/'ignore' round-trip then drops the combining marks.
    decomposed = unicodedata.normalize('NFKD', plain)
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
140
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Strip accents first so accented letters survive as ASCII ones.
    ascii_text = unaccent(text)
    return letters(ascii_text).lower()
152
153
def index_of_coincidence(text):
    """Return the index of coincidence of the text, normalised by
    alphabet size.

    The text is sanitised (accents stripped, non-letters removed,
    lowercased) before counting. The classic statistic is
    sum(f_i * (f_i - 1)) / (n * (n - 1)); dividing by a further 1/26
    scales it so uniformly random letter text scores about 1.0 and
    English plaintext scores noticeably higher.

    Returns 0.0 for texts with fewer than two letters, where the
    statistic is undefined (the denominator would be zero).
    """
    stext = sanitise(text)
    n = len(stext)
    if n < 2:
        return 0.0
    counts = collections.Counter(stext)
    # f_i * (f_i - 1), not (f_i * f_i) - 1: the old precedence bug
    # overstated every present letter's contribution by f_i - 1 (and
    # needed a max(..., 0) to hide the -1 from absent letters).
    numerator = sum(counts[letter] * (counts[letter] - 1)
                    for letter in string.ascii_lowercase)
    # Denominator must use the sanitised length throughout; the old
    # len(text) term counted spaces and punctuation, deflating the IC.
    return numerator / (n * (n - 1) / 26)
163
164
def frequencies(text):
    """Count the number of occurrences of each character in text.

    Returns a collections.Counter, so looking up a character that never
    occurs gives 0 rather than raising KeyError.

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
            'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter consumes the iterable directly; the previous
    # (c for c in text) generator was a pointless indirection.
    return collections.Counter(text)
193
# Run the embedded doctests when this module is executed directly.
if __name__ == "__main__":
    import doctest
    doctest.testmod()