X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=language_models.py;h=a6a711f1562d8c70f091165fa15330a825a48559;hb=21c390a77d42729afa23844ef2f1295106bed3de;hp=da5d2d07fa2003a3bf95a4a6629c1eafd666382b;hpb=d0a53e974970bc915d94280b5158b50f93054dc3;p=cipher-tools.git

diff --git a/language_models.py b/language_models.py
index da5d2d0..a6a711f 100644
--- a/language_models.py
+++ b/language_models.py
@@ -7,51 +7,6 @@ import itertools
 from math import log10
 import os 
 
-unaccent_specials = ''.maketrans({"’": "'", '“': '"', '”': '"'})
-
-def letters(text):
-    """Remove all non-alphabetic characters from a text
-    >>> letters('The Quick')
-    'TheQuick'
-    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
-    'TheQuickBROWNfoxjumpedoverthelazyDOG'
-    """
-    return ''.join([c for c in text if c in string.ascii_letters])
-
-def unaccent(text):
-    """Remove all accents from letters. 
-    It does this by converting the unicode string to decomposed compatability
-    form, dropping all the combining accents, then re-encoding the bytes.
-
-    >>> unaccent('hello')
-    'hello'
-    >>> unaccent('HELLO')
-    'HELLO'
-    >>> unaccent('héllo')
-    'hello'
-    >>> unaccent('héllö')
-    'hello'
-    >>> unaccent('HÉLLÖ')
-    'HELLO'
-    """
-    translated_text = text.translate(unaccent_specials)
-    return unicodedata.normalize('NFKD', translated_text).\
-        encode('ascii', 'ignore').\
-        decode('utf-8')
-
-def sanitise(text):
-    """Remove all non-alphabetic characters and convert the text to lowercase
-    
-    >>> sanitise('The Quick')
-    'thequick'
-    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
-    'thequickbrownfoxjumpedoverthelazydog'
-    >>> sanitise('HÉLLÖ')
-    'hello'
-    """
-    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
-    # return ''.join(sanitised)
-    return letters(unaccent(text)).lower()
 
 
 def datafile(name, sep='\t'):