language_models.py

   1 import unicodedata
   2
   3 def unaccent(text):
   4     """Remove all accents from letters.
   5     It does this by converting the unicode string to decomposed compatability
   6     form, dropping all the combining accents, then re-encoding the bytes.
   7
   8     >>> unaccent('hello')
   9     'hello'
  10     >>> unaccent('HELLO')
  11     'HELLO'
  12     >>> unaccent('héllo')
  13     'hello'
  14     >>> unaccent('héllö')
  15     'hello'
  16     >>> unaccent('HÉLLÖ')
  17     'HELLO'
  18     """
  19     return unicodedata.normalize('NFKD', text).\
  20         encode('ascii', 'ignore').\
  21         decode('utf-8')
  22
  23
  24 if __name__ == "__main__":
  25     import doctest
  26     doctest.testmod()