Minor documentation updates
[szyfrow.git] / szyfrow / support / utilities.py
1 """A mish-mash of utility functions"""
2
3 import string
4 import collections
5 import unicodedata
6 from itertools import zip_longest
7
cat = "".join
"""Join a list of letters into a single string."""

wcat = " ".join
"""Join a list of words into a single string, separated by spaces."""

lcat = "\n".join
"""Join a list of lines into a single string, separated by newlines."""
16
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Upper- and lower-case ASCII letters map to the same position, so both
    'a' and 'A' give 0 and both 'z' and 'Z' give 25.

    Raises ValueError if letter is not exactly one ASCII letter.

    >>> pos('a')
    0
    >>> pos('Z')
    25
    """
    # `x in some_string` is a substring test, so '' and runs such as 'abc'
    # would pass it and then crash inside ord(); insist on one character.
    if len(letter) == 1:
        if letter in string.ascii_lowercase:
            return ord(letter) - ord('a')
        if letter in string.ascii_uppercase:
            return ord(letter) - ord('A')
    raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
25
def unpos(number):
    """Return the lowercase letter at the given position in the alphabet.

    The position is taken modulo 26, so 26 wraps round to 'a' and -1 wraps
    round to 'z'.
    """
    wrapped = number % 26
    return string.ascii_lowercase[wrapped]
29
def pad(message_len, group_len, fillvalue):
    """Return the padding that extends a message to a multiple of group_len.

    message_len is the current length of the message; the result is the
    string to append to it. Nothing is added (the empty string is returned)
    when message_len is already a multiple of group_len.

    fillvalue is either a literal value, which is repeated, or a function,
    which is called once for each padded character. Use this with
    fillvalue=random_english_letter to pad a message with random letters.
    """
    # Negating before the modulo gives the shortfall to the next multiple of
    # group_len directly, and zero when the length already fits.
    shortfall = -message_len % group_len
    if callable(fillvalue):
        return ''.join(fillvalue() for _ in range(shortfall))
    # Concatenating onto '' keeps the result a str and rejects fill values
    # that are not strings.
    return '' + fillvalue * shortfall
47
def every_nth(text, n, fillvalue=''):
    """Split text into n strings, each holding every nth character.

    The first string starts at the 0th character, the second at the 1st,
    and so on up to the (n-1)th character. If fillvalue is given, the
    shorter strings are padded with it to the length of the longest.

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Lay the text out in rows of n characters, then read down the columns.
    rows = chunks(text, n, fillvalue)
    columns = zip_longest(*rows, fillvalue=fillvalue)
    return [cat(column) for column in columns]
64
def combine_every_nth(split_text):
    """Rebuild a text from the strings produced by every_nth.

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Take one character from each string in turn; strings that have run
    # out contribute nothing.
    interleaved = zip_longest(*split_text, fillvalue='')
    return cat(cat(group) for group in interleaved)
77
def chunks(text, n, fillvalue=None):
    """Split a text into consecutive chunks of n characters.

    The last chunk is shorter than n when the text does not divide evenly,
    unless a (truthy) fillvalue is given, in which case the last chunk is
    padded out to n characters using pad().

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    padded_text = (text + pad(len(text), n, fillvalue)) if fillvalue else text
    # Chunk boundaries come from the unpadded length: padding only ever
    # lengthens the final chunk, it never creates a new one.
    starts = range(0, len(text), n)
    return [padded_text[start:start + n] for start in starts]
95
def transpose(items, transposition):
    """Rearrange items according to the given transposition.

    transposition is an iterable of indices into items. Element p of the
    result is items[transposition[p]], so the result has one element for
    each index in transposition.

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    # Building the list directly needs no placeholder values and no len(),
    # so any iterable of indices works.
    return [items[source] for source in transposition]
110
def untranspose(items, transposition):
    """Reverse the rearrangement made by transpose.

    The item at position p of items is moved back to position
    transposition[p] of the result.

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    # Slots are filled out of order, so start from a list of placeholders.
    restored = [''] * len(transposition)
    for current, original in enumerate(transposition):
        restored[original] = items[current]
    return restored
125
def deduplicate(text):
    """Return a list of the distinct characters of text, in order of first
    appearance. Second (and subsequent) occurrences of a character are
    dropped.

    Note that the result is a list, not a string.
    """
    # An OrderedDict keeps one key per character and remembers the order in
    # which the keys were first inserted.
    first_seen = collections.OrderedDict.fromkeys(text)
    return list(first_seen)
131
132
def letters(text):
    """Return text with everything except ASCII letters removed.

    Case is preserved; digits, spaces and punctuation are dropped.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    kept = filter(lambda character: character in string.ascii_letters, text)
    return ''.join(kept)
141
# Translation table for characters that Unicode decomposition does not turn
# into ASCII, such as smart quotes; each maps to its plain ASCII equivalent.
unaccent_specials = str.maketrans({"’": "'", '“': '"', '”': '"'})


def unaccent(text):
    """Remove all accents from letters.

    Smart quotes are first replaced by plain quotes. The text is then
    converted to Unicode decomposed compatibility form (NFKD), which
    separates base letters from their combining accents, and everything
    that is not ASCII is dropped.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    straightened = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', straightened)
    # Encoding with errors ignored discards the combining marks (and any
    # other non-ASCII character); decoding turns the bytes back into a str.
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')
165
def sanitise(text):
    """Strip accents, drop all non-alphabetic characters and convert what
    remains to lowercase.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Accents must go first: letters() keeps only ASCII letters, so an
    # accented letter would otherwise be discarded rather than unaccented.
    plain_text = unaccent(text)
    only_letters = letters(plain_text)
    return only_letters.lower()
177
178
def index_of_coincidence(text):
    """Return the normalised index of coincidence of the letters in text.

    The text is sanitised first, so only its letters are considered and
    case and accents are ignored. With N letters in total and n_i
    occurrences of letter i, the result is

        sum(n_i * (n_i - 1)) / (N * (N - 1) / 26)

    This is low for random text (roughly 1.0 when all letters are equally
    likely) and higher for natural language.

    Returns 0.0 if the text contains fewer than two letters, since no pair
    of letters can then be drawn from it.
    """
    stext = sanitise(text)
    letter_count = len(stext)
    if letter_count < 2:
        # Avoids dividing by zero below.
        return 0.0
    counts = collections.Counter(stext)
    # Number of ordered pairs of positions in the text that hold the same
    # letter.
    coincidences = sum(counts[l] * (counts[l] - 1)
                       for l in string.ascii_lowercase)
    # Both factors use the sanitised length, so spaces and punctuation in
    # the input do not affect the result.
    denom = letter_count * (letter_count - 1) / 26
    return coincidences / denom
191
192
def frequencies(text):
    """Count how many times each character occurs in text.

    The result is a collections.Counter, so looking up a character that
    does not occur gives 0.

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Passing an iterator makes Counter count the elements one by one,
    # whatever kind of container text is.
    elements = iter(text)
    return collections.Counter(elements)
221
if __name__ == "__main__":
    # When run as a script, check the examples in this module's docstrings.
    from doctest import testmod
    testmod()