7a61ae709a55d2115118991bc78f860caf1ad66b
import collections
import string
import unicodedata
from itertools import zip_longest
# join a list of letters into a string
cat = ''.join

# join a list of words into a string, separated by spaces
wcat = ' '.join

# join a list of lines, separated by newline
lcat = '\n'.join
def pos(letter):
    """Return the position of a letter in the alphabet (0-25)

    Accepts a single ASCII letter in either case; raises ValueError for
    anything else (digits, punctuation, accented characters, empty string).
    """
    if letter in string.ascii_lowercase:
        return ord(letter) - ord('a')
    elif letter in string.ascii_uppercase:
        return ord(letter) - ord('A')
    else:
        raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
def unpos(number):
    """Return the letter in the given position in the alphabet (mod 26)

    Always returns a lowercase letter; any integer is accepted because the
    position is reduced modulo 26 first.
    """
    return chr(number % 26 + ord('a'))
def pad(message_len, group_len, fillvalue):
    """Return the padding needed to extend a message to a multiple of group_len
    in length.

    fillvalue can be a function or a literal value. If a function, it is called
    once for each padded character. Use this with fillvalue=random_english_letter
    to pad a message with random letters.
    """
    padding_length = group_len - message_len % group_len
    # A message already at a multiple of group_len needs no padding.
    if padding_length == group_len:
        padding_length = 0
    padding = ''
    if callable(fillvalue):
        # Call once per padded character so e.g. random fills vary.
        for i in range(padding_length):
            padding += fillvalue()
    else:
        padding += fillvalue * padding_length
    return padding
def every_nth(text, n, fillvalue=''):
    """Returns n strings, each of which consists of every nth character,
    starting with the 0th, 1st, 2nd, ... (n-1)th character

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Split into rows of n characters, then read off the columns.
    split_text = chunks(text, n, fillvalue)
    return [cat(l) for l in zip_longest(*split_text, fillvalue=fillvalue)]
def combine_every_nth(split_text):
    """Reforms a text split into every_nth strings

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Transpose the columns back into rows; '' fill drops ragged ends.
    return ''.join(''.join(l)
                   for l in zip_longest(*split_text, fillvalue=''))
def chunks(text, n, fillvalue=None):
    """Split a text into chunks of n characters

    If fillvalue is given, the final chunk is padded out to length n with it
    (a literal value or a zero-argument function, per pad()).

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    # Guard: pad() cannot multiply None; no fillvalue means no padding.
    if fillvalue is None:
        padding = ''
    else:
        padding = pad(len(text), n, fillvalue)
    padded_text = text + padding
    # range over len(text) still yields ceil(len(text)/n) chunks, since
    # padding is always shorter than one group.
    return [padded_text[i:i + n] for i in range(0, len(text), n)]
def transpose(items, transposition):
    """Moves items around according to the given transposition

    transposition is a sequence of source indices: output position p receives
    items[transposition[p]].

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    transposed = [''] * len(transposition)
    for p, t in enumerate(transposition):
        transposed[p] = items[t]
    return transposed
def untranspose(items, transposition):
    """Undoes a transpose

    Inverse of transpose(): output position transposition[p] receives
    items[p].

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    transposed = [''] * len(transposition)
    for p, t in enumerate(transposition):
        transposed[t] = items[p]
    return transposed
def deduplicate(text):
    """Return the distinct elements of text as a list, keeping first-seen
    order (OrderedDict.fromkeys preserves insertion order of keys)."""
    return list(collections.OrderedDict.fromkeys(text))
def letters(text):
    """Remove all non-alphabetic characters from a text

    Only ASCII letters survive; case is preserved.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join([c for c in text if c in string.ascii_letters])
# Special characters for conversion, such as smart quotes.
# Maps each special character to its plain-ASCII equivalent for str.translate.
unaccent_specials = str.maketrans({'’': "'", '“': '"', '”': '"'})
def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatability
    form, dropping all the combining accents, then re-encoding the bytes.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # First normalise smart quotes etc. that NFKD cannot decompose.
    translated_text = text.translate(unaccent_specials)
    # NFKD splits accented letters into base letter + combining mark;
    # the ascii encode with 'ignore' drops the marks, and decode
    # returns a str (without it the function would return bytes).
    return unicodedata.normalize('NFKD', translated_text).\
        encode('ascii', 'ignore').\
        decode('utf-8')
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    Accents are stripped first so accented letters survive as plain letters.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    return letters(unaccent(text)).lower()
def index_of_coincidence(text):
    """Return the index of coincidence of text.

    The text is sanitised (lowercased, non-letters removed) first. Uses the
    standard normalised IC: sum over letters of f*(f-1), divided by
    N*(N-1)/26 where N is the sanitised length, so random text scores
    about 1.0 and natural English noticeably higher.
    """
    stext = sanitise(text)
    counts = collections.Counter(stext)
    # Fix: denominator must use the sanitised length (counts is over stext;
    # the raw text length would include spaces/punctuation and skew the score).
    denom = len(stext) * (len(stext) - 1) / 26
    # Guard: texts of length 0 or 1 have no letter pairs.
    if denom == 0:
        return 0.0
    # Fix: f*(f-1) needs parentheses; f*f - 1 was a precedence bug.
    return sum(counts[l] * (counts[l] - 1)
               for l in string.ascii_lowercase) / denom
def frequencies(text):
    """Count the number of occurrences of each character in text

    Returns a collections.Counter, so absent characters count as 0.

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter consumes any iterable of hashable items directly;
    # the wrapping generator (c for c in text) was redundant.
    return collections.Counter(text)
214 if __name__
== "__main__":