7a61ae709a55d2115118991bc78f860caf1ad66b
[szyfrow.git] / szyfrow / support / utilities.py
1 import string
2 import collections
3 import unicodedata
4 from itertools import zip_longest
5
# Concatenate an iterable of letters into a single string.
cat = ''.join

# Concatenate an iterable of words into one string, separated by spaces.
wcat = ' '.join

# Concatenate an iterable of lines into one string, separated by newlines.
lcat = '\n'.join
14
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Upper- and lower-case ASCII letters give the same position, so both
    'a' and 'A' return 0 and both 'z' and 'Z' return 25.

    Raises ValueError if letter is not exactly one ASCII letter.

    >>> pos('a')
    0
    >>> pos('Z')
    25
    """
    # The length guard matters: `in` on a string is a substring test, so ''
    # and runs such as 'abc' would pass the alphabet checks below and then
    # make ord() fail with a TypeError instead of the intended ValueError.
    if len(letter) == 1:
        if letter in string.ascii_lowercase:
            return ord(letter) - ord('a')
        if letter in string.ascii_uppercase:
            return ord(letter) - ord('A')
    raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
23
def unpos(number):
    """Return the lowercase letter at the given alphabet position.

    The position is taken modulo 26, so 26 wraps round to 'a' and -1 gives
    'z'.
    """
    base = ord('a')
    offset = number % 26
    return chr(base + offset)
27
def pad(message_len, group_len, fillvalue):
    """Return the padding needed to extend a message to a multiple of
    group_len in length.

    message_len is the current length of the message; the padding returned
    is empty when that length is already a multiple of group_len.

    fillvalue can be a function or a literal string. If a function, it is
    called once for each padded character and the results are joined, so a
    function that returns a random letter pads the message with random
    letters. If a literal, it is repeated once per padded character.

    >>> pad(9, 4, '!')
    '!!!'
    >>> pad(8, 4, '!')
    ''
    """
    # Negating before the modulo gives the distance up to the next multiple
    # of group_len, and 0 when message_len already is one.
    padding_length = -message_len % group_len
    if callable(fillvalue):
        # Join once rather than growing a string with += on every call.
        return ''.join(fillvalue() for _ in range(padding_length))
    return fillvalue * padding_length
45
def every_nth(text, n, fillvalue=''):
    """Deal the characters of text out into n strings in turn.

    The first string holds characters 0, n, 2n, ...; the second holds
    characters 1, n+1, 2n+1, ...; and so on up to the (n-1)th. When
    fillvalue is given, it fills the strings that would otherwise come up
    short.

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Cut the text into rows of n characters, then read the rows off
    # column by column.
    rows = chunks(text, n, fillvalue)
    columns = zip_longest(*rows, fillvalue=fillvalue)
    return [cat(column) for column in columns]
62
def combine_every_nth(split_text):
    """Rebuild a text from the strings produced by every_nth.

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Taking one character from each string in turn restores the original
    # order; strings that have run out contribute nothing.
    interleaved = zip_longest(*split_text, fillvalue='')
    pieces = [cat(group) for group in interleaved]
    return cat(pieces)
75
def chunks(text, n, fillvalue=None):
    """Cut a text into consecutive pieces of n characters.

    The last piece is shorter when the text does not divide evenly, unless
    a fillvalue is given, in which case the last piece is padded out to n
    characters with it.

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    padded_text = text
    if fillvalue:
        padded_text = text + pad(len(text), n, fillvalue)
    # Start positions come from the unpadded length, so padding only ever
    # completes the final piece.
    starts = range(0, len(text), n)
    return [padded_text[start:start + n] for start in starts]
93
def transpose(items, transposition):
    """Rearrange items according to the given transposition.

    Position p of the result holds items[transposition[p]].

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    return [items[source] for source in transposition]
108
def untranspose(items, transposition):
    """Reverse the effect of transpose with the same transposition.

    The item at position p goes back to position transposition[p]. Any
    position that the transposition never names is left as ''.

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    restored = [''] * len(transposition)
    for current, original in enumerate(transposition):
        restored[original] = items[current]
    return restored
123
def deduplicate(text):
    """Return the distinct items of text as a list, in first-seen order.

    >>> deduplicate('abracadabra')
    ['a', 'b', 'r', 'c', 'd']
    """
    # Dictionary keys are unique, and an OrderedDict keeps them in the
    # order they were first inserted.
    first_seen = collections.OrderedDict.fromkeys(text)
    return list(first_seen)
126
127
def letters(text):
    """Keep only the ASCII letters of a text, preserving case and order.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    kept = (character for character in text
            if character in string.ascii_letters)
    return ''.join(kept)
136
# Characters swapped for plain ASCII equivalents before the accents are
# stripped, such as smart quotes.
unaccent_specials = str.maketrans({"’": "'", '“': '"', '”': '"'})


def unaccent(text):
    """Remove all accents from letters.

    The special characters in unaccent_specials are translated first. The
    text is then converted to decomposed compatibility form (NFKD) and
    encoded as ASCII with unencodable characters ignored, which drops the
    combining accents, before being decoded back into a string.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    straightened = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', straightened)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
160
def sanitise(text):
    """Reduce a text to its letters alone, unaccented and in lowercase.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Accents are stripped first so that accented letters survive as their
    # plain forms rather than being discarded as non-letters.
    plain = unaccent(text)
    only_letters = letters(plain)
    return only_letters.lower()
172
173
def index_of_coincidence(text):
    """Return the index of coincidence of a text, scaled for 26 letters.

    The text is sanitised first, so only its letters count, without regard
    to case or accents. With N letters in total and n occurrences of a
    given letter, the result is

        26 * sum(n * (n - 1) over each letter) / (N * (N - 1))

    Returns 0.0 when the text holds fewer than two letters, as there is
    then no pair of letters to compare.

    >>> index_of_coincidence('aaaa')
    26.0
    >>> index_of_coincidence('abcd')
    0.0
    >>> index_of_coincidence('a')
    0.0
    """
    stext = sanitise(text)
    total = len(stext)
    if total < 2:
        return 0.0
    counts = collections.Counter(stext)
    # n * (n - 1) counts the ordered pairs of identical letters; it is
    # already 0 for letters that occur once or not at all.
    coincidences = sum(counts[letter] * (counts[letter] - 1)
                       for letter in string.ascii_lowercase)
    # Both factors use the sanitised length, so characters that were
    # stripped out do not affect the result.
    return 26 * coincidences / (total * (total - 1))
183
184
def frequencies(text):
    """Count how many times each character occurs in text.

    Every character is counted as it stands, so case, spaces and
    punctuation are all kept distinct. Characters that never occur have a
    count of 0.

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('Aa a!').items())
    [(' ', 1), ('!', 1), ('A', 1), ('a', 2)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Passing an iterator makes Counter tally the items one by one, whatever
    # kind of container text happens to be.
    return collections.Counter(iter(text))
213
if __name__ == "__main__":
    # When run as a script, check every doctest in this module's docstrings.
    import doctest

    doctest.testmod()