utilities.py

   1 import string
   2 import collections
   3 import unicodedata
   4 from itertools import zip_longest
   5
   6 # join a a list of letters into a string
   7 cat = ''.join
   8
   9 # join a list of words into a string, separated by spaces
  10 wcat = ' '.join
  11
  12 # join a list of lines, separated by newline
  13 lcat = '\n'.join
  14
  15 def pos(letter):
  16     """Return the position of a letter in the alphabet (0-25)"""
  17     if letter in string.ascii_lowercase:
  18         return ord(letter) - ord('a')
  19     elif letter in string.ascii_uppercase:
  20         return ord(letter) - ord('A')
  21     else:
  22         raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
  23
  24 def unpos(number):
  25     """Return the letter in the given position in the alphabet (mod 26)"""
  26     return chr(number % 26 + ord('a'))
  27
  28 def every_nth(text, n, fillvalue=''):
  29     """Returns n strings, each of which consists of every nth character,
  30     starting with the 0th, 1st, 2nd, ... (n-1)th character
  31
  32     >>> every_nth(string.ascii_lowercase, 5)
  33     ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
  34     >>> every_nth(string.ascii_lowercase, 1)
  35     ['abcdefghijklmnopqrstuvwxyz']
  36     >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
  37     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
  38      'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  39     >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
  40     ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
  41     """
  42     split_text = chunks(text, n, fillvalue)
  43     return [cat(l) for l in zip_longest(*split_text, fillvalue=fillvalue)]
  44
  45 def combine_every_nth(split_text):
  46     """Reforms a text split into every_nth strings
  47
  48     >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
  49     'abcdefghijklmnopqrstuvwxyz'
  50     >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
  51     'abcdefghijklmnopqrstuvwxyz'
  52     >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
  53     'abcdefghijklmnopqrstuvwxyz'
  54     """
  55     return cat([cat(l)
  56                     for l in zip_longest(*split_text, fillvalue='')])
  57
  58 def chunks(text, n, fillvalue=None):
  59     """Split a text into chunks of n characters
  60
  61     >>> chunks('abcdefghi', 3)
  62     ['abc', 'def', 'ghi']
  63     >>> chunks('abcdefghi', 4)
  64     ['abcd', 'efgh', 'i']
  65     >>> chunks('abcdefghi', 4, fillvalue='!')
  66     ['abcd', 'efgh', 'i!!!']
  67     """
  68     if fillvalue:
  69         padding = fillvalue[0] * (n - len(text) % n)
  70     else:
  71         padding = ''
  72     return [(text+padding)[i:i+n] for i in range(0, len(text), n)]
  73
  74 def transpose(items, transposition):
  75     """Moves items around according to the given transposition
  76
  77     >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
  78     ['a', 'b', 'c', 'd']
  79     >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
  80     ['d', 'b', 'c', 'a']
  81     >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
  82     [13, 12, 14, 11, 15, 10]
  83     """
  84     transposed = [''] * len(transposition)
  85     for p, t in enumerate(transposition):
  86        transposed[p] = items[t]
  87     return transposed
  88
  89 def untranspose(items, transposition):
  90     """Undoes a transpose
  91
  92     >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
  93     ['a', 'b', 'c', 'd']
  94     >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
  95     ['a', 'b', 'c', 'd']
  96     >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
  97     [10, 11, 12, 13, 14, 15]
  98     """
  99     transposed = [''] * len(transposition)
 100     for p, t in enumerate(transposition):
 101        transposed[t] = items[p]
 102     return transposed
 103
 104 def deduplicate(text):
 105     return list(collections.OrderedDict.fromkeys(text))
 106
 107
 108 def letters(text):
 109     """Remove all non-alphabetic characters from a text
 110     >>> letters('The Quick')
 111     'TheQuick'
 112     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
 113     'TheQuickBROWNfoxjumpedoverthelazyDOG'
 114     """
 115     return ''.join([c for c in text if c in string.ascii_letters])
 116
 117 # Special characters for conversion, such as smart quotes.
 118 unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})
 119
 120 def unaccent(text):
 121     """Remove all accents from letters.
 122     It does this by converting the unicode string to decomposed compatability
 123     form, dropping all the combining accents, then re-encoding the bytes.
 124
 125     >>> unaccent('hello')
 126     'hello'
 127     >>> unaccent('HELLO')
 128     'HELLO'
 129     >>> unaccent('héllo')
 130     'hello'
 131     >>> unaccent('héllö')
 132     'hello'
 133     >>> unaccent('HÉLLÖ')
 134     'HELLO'
 135     """
 136     translated_text = text.translate(unaccent_specials)
 137     return unicodedata.normalize('NFKD', translated_text).\
 138         encode('ascii', 'ignore').\
 139         decode('utf-8')
 140
 141 def sanitise(text):
 142     """Remove all non-alphabetic characters and convert the text to lowercase
 143
 144     >>> sanitise('The Quick')
 145     'thequick'
 146     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
 147     'thequickbrownfoxjumpedoverthelazydog'
 148     >>> sanitise('HÉLLÖ')
 149     'hello'
 150     """
 151     return letters(unaccent(text)).lower()
 152
 153
 154 def index_of_coincidence(text):
 155     stext = sanitise(text)
 156     counts = collections.Counter(stext)
 157     denom = len(stext) * (len(text) - 1) / 26
 158     return (
 159         sum(max(counts[l] * counts[l] - 1, 0) for l in string.ascii_lowercase)
 160         /
 161         denom
 162     )
 163
 164
 165 def frequencies(text):
 166     """Count the number of occurrences of each character in text
 167
 168     >>> sorted(frequencies('abcdefabc').items())
 169     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
 170     >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
 171          'dog').items()) # doctest: +NORMALIZE_WHITESPACE
 172     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
 173      ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
 174      ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
 175      ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 176     >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
 177          '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
 178     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
 179      ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
 180      ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
 181      ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
 182      ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
 183     >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
 184          'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
 185     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
 186      ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
 187      ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
 188      ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 189     >>> frequencies('abcdefabcdef')['x']
 190     0
 191     """
 192     return collections.Counter(c for c in text)
 193
 194 if __name__ == "__main__":
 195     import doctest