language_models.py

   1 """Language-specific functions, including models of languages based on data of
   2 their use.
   3 """
   4
   5 import string
   6 import norms
   7 import collections
   8 import unicodedata
   9
  10 def letters(text):
  11     """Remove all non-alphabetic characters from a text
  12     >>> letters('The Quick')
  13     'TheQuick'
  14     >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  15     'TheQuickBROWNfoxjumpedoverthelazyDOG'
  16     """
  17     return ''.join([c for c in text if c in string.ascii_letters])
  18
  19 def unaccent(text):
  20     """Remove all accents from letters.
  21     It does this by converting the unicode string to decomposed compatability
  22     form, dropping all the combining accents, then re-encoding the bytes.
  23
  24     >>> unaccent('hello')
  25     'hello'
  26     >>> unaccent('HELLO')
  27     'HELLO'
  28     >>> unaccent('héllo')
  29     'hello'
  30     >>> unaccent('héllö')
  31     'hello'
  32     >>> unaccent('HÉLLÖ')
  33     'HELLO'
  34     """
  35     return unicodedata.normalize('NFKD', text).\
  36         encode('ascii', 'ignore').\
  37         decode('utf-8')
  38
  39 def sanitise(text):
  40     """Remove all non-alphabetic characters and convert the text to lowercase
  41
  42     >>> sanitise('The Quick')
  43     'thequick'
  44     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
  45     'thequickbrownfoxjumpedoverthelazydog'
  46     >>> sanitise('HÉLLÖ')
  47     'hello'
  48     """
  49     # sanitised = [c.lower() for c in text if c in string.ascii_letters]
  50     # return ''.join(sanitised)
  51     return letters(unaccent(text)).lower()
  52
  53
if __name__ == "__main__":
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest
    doctest.testmod()