szyfrow/support/utilities.py

   1 """A mish-mash of utility functions"""
   2
   3 import string
   4 import collections
   5 import unicodedata
   6 from itertools import zip_longest
   7
   8 cat = ''.join
   9 """join a a list of letters into a string."""
  10
  11 wcat = ' '.join
  12 """join a list of words into a string, separated by spaces"""
  13
  14 lcat = '\n'.join
  15 """join a list of lines, separated by newline"""
  16
  17 def pos(letter):
  18     """Return the position of a letter in the alphabet (0-25)"""
  19     if letter in string.ascii_lowercase:
  20         return ord(letter) - ord('a')
  21     elif letter in string.ascii_uppercase:
  22         return ord(letter) - ord('A')
  23     else:
  24         raise ValueError('pos requires input of {} to be an ascii letter'.format(letter))
  25
  26 def unpos(number):
  27     """Return the letter in the given position in the alphabet (mod 26)"""
  28     return chr(number % 26 + ord('a'))
  29
  30 def pad(message_len, group_len, fillvalue):
  31     """Return the padding needed to extend a message to a multiple of group_len
  32     in length.
  33
  34     fillvalue can be a function or a literal value. If a function, it is called
  35     once for each padded character. Use this with fillvalue=random_english_letter
  36     to pad a message with random letters.
  37     """
  38     padding_length = group_len - message_len % group_len
  39     if padding_length == group_len: padding_length = 0
  40     padding = ''
  41     if callable(fillvalue):
  42         for i in range(padding_length):
  43             padding += fillvalue()
  44     else:
  45         padding += fillvalue * padding_length
  46     return padding
  47
  48 def every_nth(text, n, fillvalue=''):
  49     """Returns n strings, each of which consists of every nth character,
  50     starting with the 0th, 1st, 2nd, ... (n-1)th character
  51
  52     >>> every_nth(string.ascii_lowercase, 5)
  53     ['afkpuz', 'bglqv', 'chmrw', 'dinsx', 'ejoty']
  54     >>> every_nth(string.ascii_lowercase, 1)
  55     ['abcdefghijklmnopqrstuvwxyz']
  56     >>> every_nth(string.ascii_lowercase, 26) # doctest: +NORMALIZE_WHITESPACE
  57     ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
  58      'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  59     >>> every_nth(string.ascii_lowercase, 5, fillvalue='!')
  60     ['afkpuz', 'bglqv!', 'chmrw!', 'dinsx!', 'ejoty!']
  61     """
  62     split_text = chunks(text, n, fillvalue)
  63     return [cat(l) for l in zip_longest(*split_text, fillvalue=fillvalue)]
  64
  65 def combine_every_nth(split_text):
  66     """Reforms a text split into every_nth strings
  67
  68     >>> combine_every_nth(every_nth(string.ascii_lowercase, 5))
  69     'abcdefghijklmnopqrstuvwxyz'
  70     >>> combine_every_nth(every_nth(string.ascii_lowercase, 1))
  71     'abcdefghijklmnopqrstuvwxyz'
  72     >>> combine_every_nth(every_nth(string.ascii_lowercase, 26))
  73     'abcdefghijklmnopqrstuvwxyz'
  74     """
  75     return cat([cat(l)
  76                     for l in zip_longest(*split_text, fillvalue='')])
  77
  78 def chunks(text, n, fillvalue=None):
  79     """Split a text into chunks of n characters
  80
  81     >>> chunks('abcdefghi', 3)
  82     ['abc', 'def', 'ghi']
  83     >>> chunks('abcdefghi', 4)
  84     ['abcd', 'efgh', 'i']
  85     >>> chunks('abcdefghi', 4, fillvalue='!')
  86     ['abcd', 'efgh', 'i!!!']
  87     """
  88     if fillvalue:
  89         # padding = fillvalue[0] * (n - len(text) % n)
  90         padding = pad(len(text), n, fillvalue)
  91         padded_text = text + padding
  92     else:
  93         padded_text = text
  94     return [(padded_text)[i:i+n] for i in range(0, len(text), n)]
  95
  96 def transpose(items, transposition):
  97     """Moves items around according to the given transposition
  98
  99     >>> transpose(['a', 'b', 'c', 'd'], (0,1,2,3))
 100     ['a', 'b', 'c', 'd']
 101     >>> transpose(['a', 'b', 'c', 'd'], (3,1,2,0))
 102     ['d', 'b', 'c', 'a']
 103     >>> transpose([10,11,12,13,14,15], (3,2,4,1,5,0))
 104     [13, 12, 14, 11, 15, 10]
 105     """
 106     transposed = [''] * len(transposition)
 107     for p, t in enumerate(transposition):
 108        transposed[p] = items[t]
 109     return transposed
 110
 111 def untranspose(items, transposition):
 112     """Undoes a transpose
 113
 114     >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
 115     ['a', 'b', 'c', 'd']
 116     >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
 117     ['a', 'b', 'c', 'd']
 118     >>> untranspose([13, 12, 14, 11, 15, 10], [3,2,4,1,5,0])
 119     [10, 11, 12, 13, 14, 15]
 120     """
 121     transposed = [''] * len(transposition)
 122     for p, t in enumerate(transposition):
 123        transposed[t] = items[p]
 124     return transposed
 125
 126 def deduplicate(text):
 127     """Return the input string, but with second (and subsequent) occurrences
 128     of a character removed.
 129     """
 130     return list(collections.OrderedDict.fromkeys(text))
 131
 132
 133 def letters(text):
 134     """Remove all non-alphabetic characters from a text
 135     >>> letters('The Quick')
 136     'TheQuick'
 137     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
 138     'TheQuickBROWNfoxjumpedoverthelazyDOG'
 139     """
 140     return ''.join([c for c in text if c in string.ascii_letters])
 141
 142 # Special characters for conversion, such as smart quotes.
 143 unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})
 144
 145 def unaccent(text):
 146     """Remove all accents from letters.
 147     It does this by converting the unicode string to decomposed compatability
 148     form, dropping all the combining accents, then re-encoding the bytes.
 149
 150     >>> unaccent('hello')
 151     'hello'
 152     >>> unaccent('HELLO')
 153     'HELLO'
 154     >>> unaccent('héllo')
 155     'hello'
 156     >>> unaccent('héllö')
 157     'hello'
 158     >>> unaccent('HÉLLÖ')
 159     'HELLO'
 160     """
 161     translated_text = text.translate(unaccent_specials)
 162     return unicodedata.normalize('NFKD', translated_text).\
 163         encode('ascii', 'ignore').\
 164         decode('utf-8')
 165
 166 def sanitise(text):
 167     """Remove all non-alphabetic characters and convert the text to lowercase
 168
 169     >>> sanitise('The Quick')
 170     'thequick'
 171     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
 172     'thequickbrownfoxjumpedoverthelazydog'
 173     >>> sanitise('HÉLLÖ')
 174     'hello'
 175     """
 176     return letters(unaccent(text)).lower()
 177
 178
 179 def index_of_coincidence(text):
 180     """Index of coincidence of a string. This is low for random text,
 181     higher for natural langauge.
 182     """
 183     stext = sanitise(text)
 184     counts = collections.Counter(stext)
 185     denom = len(stext) * (len(text) - 1) / 26
 186     return (
 187         sum(max(counts[l] * counts[l] - 1, 0) for l in string.ascii_lowercase)
 188         /
 189         denom
 190     )
 191
 192
 193 def frequencies(text):
 194     """Count the number of occurrences of each character in text
 195
 196     >>> sorted(frequencies('abcdefabc').items())
 197     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
 198     >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
 199          'dog').items()) # doctest: +NORMALIZE_WHITESPACE
 200     [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
 201      ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
 202      ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
 203      ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 204     >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
 205          '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
 206     [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
 207      ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
 208      ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
 209      ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
 210      ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
 211     >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
 212          'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
 213     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
 214      ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
 215      ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
 216      ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
 217     >>> frequencies('abcdefabcdef')['x']
 218     0
 219     """
 220     return collections.Counter(c for c in text)
 221
 222 if __name__ == "__main__":
 223     import doctest
 224     doctest.testmod()