1 """A mish-mash of utility functions"""
6 from itertools
import zip_longest
"""join a list of letters into a string."""
12 """join a list of words into a string, separated by spaces"""
15 """join a list of lines, separated by newline"""
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Accepts either case; raises ValueError for anything that is not an
    ASCII letter.
    """
    if letter in string.ascii_lowercase:
        base = 'a'
    elif letter in string.ascii_uppercase:
        base = 'A'
    else:
        raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
    return ord(letter) - ord(base)
def unpos(number):
    """Return the letter in the given position in the alphabet (mod 26)."""
    return chr(ord('a') + number % 26)
def pad(message_len, group_len, fillvalue):
    """Return the padding needed to extend a message to a multiple of group_len

    fillvalue can be a function or a literal value. If a function, it is called
    once for each padded character. Use this with fillvalue=random_english_letter
    to pad a message with random letters.

    Returns only the padding, not the padded message.
    """
    # The outer "% group_len" maps a full group (message already on a
    # boundary) to zero padding.
    padding_length = (group_len - message_len % group_len) % group_len
    if callable(fillvalue):
        # Call fillvalue once per padded character, e.g. a random-letter
        # generator; join avoids quadratic string concatenation.
        return ''.join(fillvalue() for _ in range(padding_length))
    return fillvalue * padding_length
def every_nth(text, n, fillvalue=''):
    """Returns n strings, each of which consists of every nth character,
    starting with the 0th, 1st, 2nd, ... (n-1)th character

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Split into groups of n, then read the groups column-wise.
    groups = chunks(text, n, fillvalue)
    columns = zip_longest(*groups, fillvalue=fillvalue)
    return [''.join(column) for column in columns]
def combine_every_nth(split_text):
    """Reforms a text split into every_nth strings

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Read the split strings column-wise; short strings contribute nothing
    # past their end.
    interleaved = zip_longest(*split_text, fillvalue='')
    return ''.join(''.join(letter_group) for letter_group in interleaved)
def chunks(text, n, fillvalue=None):
    """Split a text into chunks of n characters

    If fillvalue is given, the last chunk is padded out to length n with it
    (see pad); otherwise the last chunk may be shorter than n.

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    # Only pad when a fillvalue is supplied; calling pad with the default
    # None would attempt None * padding_length and raise TypeError.
    if fillvalue:
        padding = pad(len(text), n, fillvalue)
    else:
        padding = ''
    padded_text = text + padding
    return [padded_text[i:i + n] for i in range(0, len(text), n)]
def transpose(items, transposition):
    """Moves items around according to the given transposition

    Position p of the result holds items[transposition[p]].

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    return [items[source] for source in transposition]
def untranspose(items, transposition):
    """Undoes a transpose

    Item p of the input is written back to position transposition[p].

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    restored = [''] * len(transposition)
    for source_index, target_index in enumerate(transposition):
        restored[target_index] = items[source_index]
    return restored
def deduplicate(text):
    """Return the characters of the input as a list, keeping only the first
    occurrence of each character (second and subsequent occurrences removed),
    in original order.
    """
    # OrderedDict.fromkeys keeps the first occurrence of each key, in order.
    first_occurrences = collections.OrderedDict.fromkeys(text)
    return list(first_occurrences)
def letters(text):
    """Remove all non-alphabetic characters from a text

    Only ASCII letters are kept; case is preserved.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join(c for c in text if c in string.ascii_letters)
# Special characters for conversion, such as smart quotes.
# Translation table (for str.translate) mapping typographic quotes to
# plain ASCII equivalents; applied by unaccent() before normalisation.
unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})
def unaccent(text):
    """Remove all accents from letters.

    It does this by converting the unicode string to decomposed compatibility
    form, dropping all the combining accents, then re-encoding the bytes.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # First map smart quotes and similar specials to plain ASCII.
    translated = text.translate(unaccent_specials)
    # NFKD splits accented letters into base letter + combining mark;
    # the ascii encode with 'ignore' then drops the combining marks.
    decomposed = unicodedata.normalize('NFKD', translated)
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Strip accents first so accented letters survive the ASCII filter.
    ascii_text = unaccent(text)
    return letters(ascii_text).lower()
def index_of_coincidence(text):
    """Index of coincidence of a string. This is low for random text,
    higher for natural langauge.

    The text is sanitised (accents stripped, non-letters removed,
    lowercased) before counting. The statistic is scaled by 26, the usual
    normalisation for English-alphabet text.
    """
    stext = sanitise(text)
    counts = collections.Counter(stext)
    n = len(stext)
    if n < 2:
        # Undefined for fewer than two letters; avoid division by zero.
        return 0.0
    # Standard IC: sum of c*(c-1) over letter counts, over n*(n-1)/26.
    # Fixes the earlier version, which used the unsanitised text length in
    # the denominator and computed c*c - 1 instead of c*(c - 1) in the
    # numerator.
    denom = n * (n - 1) / 26
    return sum(c * (c - 1) for c in counts.values()) / denom
def frequencies(text):
    """Count the number of occurrences of each character in text

    Returns a collections.Counter, so missing characters report 0.

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
               'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
               '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
               'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter consumes an iterable of characters directly; the previous
    # generator wrapper (c for c in text) added nothing.
    return collections.Counter(text)
222 if __name__
== "__main__":