d1961a8a4e534e4cda82eee6249603dc5b4999de
[cipher-tools.git] / utilities.py
import collections
import string
import unicodedata

from itertools import zip_longest
# Join a list of letters into a string, e.g. cat(['a', 'b']) == 'ab'
cat = ''.join

# Join a list of words into a string, separated by spaces
wcat = ' '.join

# Join a list of lines into a string, separated by newlines
lcat = '\n'.join
12
def pos(letter):
    """Return the position of a letter in the alphabet (0-25).

    Case is ignored; any non-alphabetic character maps to 0.
    """
    if letter in string.ascii_uppercase:
        return ord(letter) - ord('A')
    if letter in string.ascii_lowercase:
        return ord(letter) - ord('a')
    return 0
21
def unpos(number):
    """Return the lowercase letter at the given alphabet position (mod 26).

    >>> unpos(0)
    'a'
    >>> unpos(27)
    'b'
    """
    return chr(ord('a') + number % 26)
25
def every_nth(text, n, fillvalue=''):
    """Returns n strings, each of which consists of every nth character,
    starting with the 0th, 1st, 2nd, ... (n-1)th character

    >>> every_nth(string.ascii_lowercase, 5)
    ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
    >>> every_nth(string.ascii_lowercase, 1)
    ['abcdefghijklmnopqrstuvwxyz']
    >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
    ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
     'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
    ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
    """
    # Cut the text into consecutive n-character rows, then read the result
    # off column by column.
    rows = chunks(text, n, fillvalue)
    columns = zip_longest(*rows, fillvalue=fillvalue)
    return [''.join(column) for column in columns]
42
def combine_every_nth(split_text):
    """Reforms a text split into every_nth strings

    >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
    'abcdefghijklmnopqrstuvwxyz'
    >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
    'abcdefghijklmnopqrstuvwxyz'
    """
    # Interleave the strands character by character; the empty fillvalue
    # simply drops the positions where shorter strands have run out.
    interleaved = zip_longest(*split_text, fillvalue='')
    return ''.join(''.join(group) for group in interleaved)
55
def chunks(text, n, fillvalue=None):
    """Split a text into chunks of n characters

    If fillvalue is given (and non-empty), its first character pads the
    final chunk up to length n.

    >>> chunks('abcdefghi', 3)
    ['abc', 'def', 'ghi']
    >>> chunks('abcdefghi', 4)
    ['abcd', 'efgh', 'i']
    >>> chunks('abcdefghi', 4, fillvalue='!')
    ['abcd', 'efgh', 'i!!!']
    """
    if fillvalue:
        # -len(text) % n is the shortfall of the last chunk (0 when exact).
        padded = text + fillvalue[0] * (-len(text) % n)
    else:
        padded = text
    # Slice positions come from the *original* length, so padding never
    # creates an extra all-fill chunk.
    return [padded[start:start + n] for start in range(0, len(text), n)]
71
def transpose(items, transposition):
    """Moves items around according to the given transposition

    Position p of the result holds items[transposition[p]].

    >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
    ['a', 'b', 'c', 'd']
    >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
    ['d', 'b', 'c', 'a']
    >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
    [13, 12, 14, 11, 15, 10]
    """
    return [items[source] for source in transposition]
86
def untranspose(items, transposition):
    """Undoes a transpose

    The item at position p is sent back to position transposition[p],
    inverting the permutation applied by transpose().

    >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
    ['a', 'b', 'c', 'd']
    >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
    ['a', 'b', 'c', 'd']
    >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
    [10, 11, 12, 13, 14, 15]
    """
    restored = [''] * len(transposition)
    for source_position, destination in enumerate(transposition):
        restored[destination] = items[source_position]
    return restored
101
def deduplicate(text):
    """Return the distinct elements of text as a list, keeping
    first-occurrence order."""
    seen = set()
    unique = []
    for element in text:
        if element not in seen:
            seen.add(element)
            unique.append(element)
    return unique
104
105
def letters(text):
    """Remove all non-alphabetic characters from a text
    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    kept = (character for character in text
            if character in string.ascii_letters)
    return ''.join(kept)
114
# Translation table for typographic ("smart") punctuation, such as curly
# quotes, that should become plain ASCII before accent stripping.
unaccent_specials = str.maketrans({"’": "'", '“': '"', '”': '"'})
117
def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatability
    form, dropping all the combining accents, then re-encoding the bytes.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    # Smart quotes survive NFKD decomposition, so translate them first.
    plain = text.translate(unaccent_specials)
    decomposed = unicodedata.normalize('NFKD', plain)
    # Encoding with 'ignore' drops the combining accent code points.
    return decomposed.encode('ascii', 'ignore').decode('utf-8')
138
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    >>> sanitise('HÉLLÖ')
    'hello'
    """
    # Strip accents first so accented letters survive the letters() filter.
    ascii_text = unaccent(text)
    return letters(ascii_text).lower()
150
151
def index_of_coincidence(text):
    """Return the index of coincidence of text, scaled by the alphabet size.

    The text is sanitised first, so only the letters a-z are counted.
    Uniformly random letter text scores about 1.0; natural English is
    noticeably higher. Returns 0 for texts with fewer than two letters,
    where the statistic is undefined.
    """
    stext = sanitise(text)
    counts = collections.Counter(stext)
    total = len(stext)
    if total < 2:
        # Avoid dividing by zero; IC needs at least one pair of letters.
        return 0
    # IC = sum over letters of c*(c-1), divided by N*(N-1), scaled by 26.
    # Note: numerator must be c*(c-1), not c*c - 1, and the denominator
    # must use the sanitised length, not the raw input length.
    denom = total * (total - 1) / 26
    return sum(c * (c - 1) for c in counts.values()) / denom
161
162
# Index of keyword transpositions: maps each transposition (as produced by
# transpositions_of) to the list of keywords that generate it.
# NOTE(review): `keywords` and `transpositions_of` are not defined anywhere
# in this part of the file — presumably they are defined/loaded elsewhere in
# the module; confirm before relying on this running at import time. The
# value returned by transpositions_of must be hashable (e.g. a tuple) for
# use as a dict key.
transpositions = collections.defaultdict(list)
for word in keywords:
    transpositions[transpositions_of(word)] += [word]
166
def frequencies(text):
    """Count the number of occurrences of each character in text

    Returns a Counter, so looking up a character that never occurs gives 0
    rather than raising KeyError.

    >>> sorted(frequencies('abcdefabc').items())
    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
    >>> frequencies('abcdefabcdef')['x']
    0
    """
    # Counter consumes the iterable directly; the original wrapped text in a
    # redundant generator expression.
    return collections.Counter(text)