ca984a30f96e4b09545eb09b277a5b0dc188be28
[cipher-tools.git] / support / utilities.py
import collections
import string
import unicodedata
from itertools import zip_longest
4
# join a list of letters into a string
cat = ''.join

# join a list of words into a string, separated by spaces
wcat = ' '.join

# join a list of lines into a string, separated by newlines
lcat = '\n'.join
13
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Accepts either case; raises ValueError for non-ASCII-letter input.
    """
    # Try each alphabet in turn, offsetting from that alphabet's first letter.
    for alphabet, base in ((string.ascii_lowercase, 'a'),
                           (string.ascii_uppercase, 'A')):
        if letter in alphabet:
            return ord(letter) - ord(base)
    raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
22
def unpos(number):
    """Return the letter in the given position in the alphabet (mod 26)"""
    # Wrap any integer into 0-25, then offset from 'a'.
    return chr(ord('a') + number % 26)
26
def every_nth(text, n, fillvalue=''):
    """Returns n strings, each of which consists of every nth character,
    starting with the 0th, 1st, 2nd, ... (n-1)th character

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Cut the text into rows of n characters, then read it column-wise:
    # column i collects every nth character starting at offset i.
    rows = chunks(text, n, fillvalue)
    columns = zip_longest(*rows, fillvalue=fillvalue)
    return [''.join(column) for column in columns]
43
def combine_every_nth(split_text):
    """Reforms a text split into every_nth strings

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Interleave the strings character by character; shorter strings simply
    # contribute nothing ('' fill) once exhausted.
    interleaved = zip_longest(*split_text, fillvalue='')
    return ''.join(''.join(group) for group in interleaved)
56
def chunks(text, n, fillvalue=None):
    """Split a text into chunks of n characters

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    # When a (truthy) fillvalue is given, pad with its first character so the
    # final chunk reaches full length; the slice bounds below guarantee any
    # surplus padding is never emitted.
    padded = text
    if fillvalue:
        padded = text + fillvalue[0] * (n - len(text) % n)
    return [padded[start:start + n] for start in range(0, len(text), n)]
72
def transpose(items, transposition):
    """Moves items around according to the given transposition

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    # Output position p receives the item at source index transposition[p].
    return [items[source] for source in transposition]
87
def untranspose(items, transposition):
    """Undoes a transpose

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    # Scatter each item back to the position it originally came from:
    # item p of the transposed sequence returns to index transposition[p].
    restored = [''] * len(transposition)
    for position, original_index in enumerate(transposition):
        restored[original_index] = items[position]
    return restored
102
def deduplicate(text):
    """Return the distinct elements of text as a list, keeping only the
    first occurrence of each and preserving their original order.

    Works on any iterable (each element must be hashable); a string yields
    a list of its distinct characters.

    >>> deduplicate('happy')
    ['h', 'a', 'p', 'y']
    >>> deduplicate('abcabc')
    ['a', 'b', 'c']
    >>> deduplicate('')
    []
    """
    # OrderedDict.fromkeys keeps one key per element, in first-seen order.
    return list(collections.OrderedDict.fromkeys(text))
105
106
def letters(text):
    """Remove all non-alphabetic characters from a text
    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    # Membership set makes the per-character test O(1).
    alphabetic = frozenset(string.ascii_letters)
    return ''.join(ch for ch in text if ch in alphabetic)
115
# Special characters for conversion, such as smart quotes.
unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})

def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatability
    form, dropping all the combining accents, then re-encoding the bytes.

    NOTE(review): relies on the unicodedata module, which the original file
    never imported — added to the top-of-file imports.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # First map smart quotes to their plain ASCII equivalents; NFKD would
    # otherwise drop them entirely in the encode step below.
    translated_text = text.translate(unaccent_specials)
    # NFKD splits accented letters into base letter + combining mark; the
    # ascii encode with 'ignore' then discards the combining marks.
    decomposed = unicodedata.normalize('NFKD', translated_text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')
139
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Strip accents first so accented letters survive the ASCII filter.
    ascii_text = unaccent(text)
    return letters(ascii_text).lower()
151
152
def index_of_coincidence(text):
    """Return the index of coincidence of text, normalised by alphabet size.

    The text is sanitised (lowercased, non-letters removed) before counting.
    Computed as sum(c * (c - 1)) / (n * (n - 1) / 26) over the letter counts
    c, where n is the number of letters; ~1.73 for English, ~1.0 for uniform
    random letters. Returns 0 for texts with fewer than two letters.

    Fixes two bugs in the original: the numerator computed
    counts[l] * counts[l] - 1 (i.e. c**2 - 1, masked by a max(..., 0)) instead
    of c * (c - 1), and the denominator mixed len(stext) with len(text).
    """
    stext = sanitise(text)
    n = len(stext)
    if n < 2:
        # No letter pairs to compare; avoid dividing by zero.
        return 0
    counts = collections.Counter(stext)
    # stext contains only lowercase letters, so counts.values() covers
    # exactly the letters that occur; absent letters contribute 0 anyway.
    numerator = sum(c * (c - 1) for c in counts.values())
    return numerator / (n * (n - 1) / 26)
162
163
def frequencies(text):
    """Count the number of occurrences of each character in text

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter iterates the string directly; missing keys count as 0.
    return collections.Counter(text)
192
193 if __name__ == "__main__":
194 import doctest