language_models.py

"""Language-specific functions, including models of languages based on data of
their use.
"""
   4
   5 import string
   6 import unicodedata
   7
   8 def letters(text):
   9     """Remove all non-alphabetic characters from a text
  10     >>> letters('The Quick')
  11     'TheQuick'
  12     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  13     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  14     """
  15     return ''.join([c for c in text if c in string.ascii_letters])
  16
  17 def unaccent(text):
  18     """Remove all accents from letters.
  19     It does this by converting the unicode string to decomposed compatability
  20     form, dropping all the combining accents, then re-encoding the bytes.
  21
  22     >>> unaccent('hello')
  23     'hello'
  24     >>> unaccent('HELLO')
  25     'HELLO'
  26     >>> unaccent('héllo')
  27     'hello'
  28     >>> unaccent('héllö')
  29     'hello'
  30     >>> unaccent('HÉLLÖ')
  31     'HELLO'
  32     """
  33     return unicodedata.normalize('NFKD', text).\
  34         encode('ascii', 'ignore').\
  35         decode('utf-8')
  36
  37 def sanitise(text):
  38     """Remove all non-alphabetic characters and convert the text to lowercase
  39
  40     >>> sanitise('The Quick')
  41     'thequick'
  42     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  43     'thequickbrownfoxjumpedoverthelazydog'
  44     >>> sanitise('HÉLLÖ')
  45     'hello'
  46     """
  47     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  48     # return ''.join(sanitised)
  49     return letters(unaccent(text)).lower()
  50
  51
  52
  53 if __name__ == "__main__":
  54     import doctest
  55     doctest.testmod()