d1961a8a4e534e4cda82eee6249603dc5b4999de
[cipher-tools.git] / utilities.py
import collections
import string
import unicodedata

from itertools import zip_longest
# Join a list of letters into a string, e.g. cat(['a', 'b']) == 'ab'
cat = ''.join

# Join a list of words into a string, separated by spaces
wcat = ' '.join

# Join a list of lines into a string, separated by newlines
lcat = '\n'.join
12
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Case is ignored; any non-alphabetic character maps to 0.
    """
    if letter in string.ascii_uppercase:
        return ord(letter) - ord('A')
    if letter in string.ascii_lowercase:
        return ord(letter) - ord('a')
    return 0
21
def unpos(number):
    """Return the lowercase letter at the given alphabet position (mod 26).

    >>> unpos(0)
    'a'
    >>> unpos(27)
    'b'
    """
    return chr(ord('a') + number % 26)
25
def every_nth(text, n, fillvalue=''):
    """Returns n strings, each of which consists of every nth character,
    starting with the 0th, 1st, 2nd, ... (n-1)th character

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Cut the text into consecutive n-character rows, then read the result
    # off column by column.
    rows = chunks(text, n, fillvalue)
    columns = zip_longest(*rows, fillvalue=fillvalue)
    return [''.join(column) for column in columns]
42
def combine_every_nth(split_text):
    """Reforms a text split into every_nth strings

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Interleave the strands character by character; the empty fillvalue
    # simply drops the positions where shorter strands have run out.
    interleaved = zip_longest(*split_text, fillvalue='')
    return ''.join(''.join(group) for group in interleaved)
55
def chunks(text, n, fillvalue=None):
    """Split a text into chunks of n characters

    If fillvalue is given (and non-empty), its first character pads the
    final chunk up to length n.

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    if fillvalue:
        # -len(text) % n is the shortfall of the last chunk (0 when exact).
        padded = text + fillvalue[0] * (-len(text) % n)
    else:
        padded = text
    # Slice positions come from the *original* length, so padding never
    # creates an extra all-fill chunk.
    return [padded[start:start + n] for start in range(0, len(text), n)]
71
def transpose(items, transposition):
    """Moves items around according to the given transposition

    Position p of the result holds items[transposition[p]].

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    return [items[source] for source in transposition]
86
def untranspose(items, transposition):
    """Undoes a transpose

    The item at position p is sent back to position transposition[p],
    inverting the permutation applied by transpose().

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    restored = [''] * len(transposition)
    for source_position, destination in enumerate(transposition):
        restored[destination] = items[source_position]
    return restored
101
def deduplicate(text):
    """Return the distinct elements of text as a list, keeping
    first-occurrence order."""
    seen = set()
    unique = []
    for element in text:
        if element not in seen:
            seen.add(element)
            unique.append(element)
    return unique
104
105
def letters(text):
    """Remove all non-alphabetic characters from a text
    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    kept = (character for character in text
            if character in string.ascii_letters)
    return ''.join(kept)
114
# Translation table for typographic ("smart") punctuation, such as curly
# quotes, that should become plain ASCII before accent stripping.
unaccent_specials = str.maketrans({"’": "'", '“': '"', '”': '"'})
117
def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatability
    form, dropping all the combining accents, then re-encoding the bytes.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # Smart quotes survive NFKD decomposition, so translate them first.
    plain = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', plain)
    # Encoding with 'ignore' drops the combining accent code points.
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
138
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Strip accents first so accented letters survive the letters() filter.
    ascii_text = unaccent(text)
    return letters(ascii_text).lower()
150
151
def index_of_coincidence(text):
    """Return the index of coincidence of text, scaled by the alphabet size.

    The text is sanitised first, so only the letters a-z are counted.
    Uniformly random letter text scores about 1.0; natural English is
    noticeably higher. Returns 0 for texts with fewer than two letters,
    where the statistic is undefined.
    """
    stext = sanitise(text)
    counts = collections.Counter(stext)
    total = len(stext)
    if total < 2:
        # Avoid dividing by zero; IC needs at least one pair of letters.
        return 0
    # IC = sum over letters of c*(c-1), divided by N*(N-1), scaled by 26.
    # Note: numerator must be c*(c-1), not c*c - 1, and the denominator
    # must use the sanitised length, not the raw input length.
    denom = total * (total - 1) / 26
    return sum(c * (c - 1) for c in counts.values()) / denom
161
162
# Index of keyword transpositions: maps each transposition (as produced by
# transpositions_of) to the list of keywords that generate it.
# NOTE(review): `keywords` and `transpositions_of` are not defined anywhere
# in this part of the file — presumably they are defined/loaded elsewhere in
# the module; confirm before relying on this running at import time. The
# value returned by transpositions_of must be hashable (e.g. a tuple) for
# use as a dict key.
transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
166
def frequencies(text):
    """Count the number of occurrences of each character in text

    Returns a Counter, so looking up a character that never occurs gives 0
    rather than raising KeyError.

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter consumes the iterable directly; the original wrapped text in a
    # redundant generator expression.
    return collections.Counter(text)