From: Neil Smith <neil.github@njae.me.uk>
Date: Thu, 14 Nov 2013 11:58:50 +0000 (+0000)
Subject: Changed sanitise and segment to cope with capital letters
X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;ds=sidebyside;h=62bbe4277e9676b9255ef98a33ba2ad3dbc0c7ed;p=cipher-tools.git

Changed sanitise and segment to cope with capital letters
---

diff --git a/cipher.py b/cipher.py
index d738d88..fdff17f 100644
--- a/cipher.py
+++ b/cipher.py
@@ -43,6 +43,14 @@ for a in range(26):
         c = (a * b) % 26
         modular_division_table[b][c] = a
 
+def letters(text):
+    """Remove all non-alphabetic characters from a text
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
 
 def sanitise(text):
     """Remove all non-alphabetic characters and convert the text to lowercase
@@ -52,8 +60,9 @@ def sanitise(text):
     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
     'thequickbrownfoxjumpedoverthelazydog'
     """
-    sanitised = [c.lower() for c in text if c in string.ascii_letters]
-    return ''.join(sanitised)
+    # Delegate the character filtering to letters(), then lowercase the result,
+    # so sanitise() and letters() share a single definition of "alphabetic".
+    return letters(text).lower()
 
 def ngrams(text, n):
     """Returns all n-grams of a text
diff --git a/segment.py b/segment.py
index 712895b..bd15e00 100644
--- a/segment.py
+++ b/segment.py
@@ -24,7 +24,7 @@ def splits(text, L=20):
 def Pwords(words): 
     """The Naive Bayes log probability of a sequence of words.
     """
-    return sum(Pw[w] for w in words)
+    return sum(Pw[w.lower()] for w in words)
 
 class Pdist(dict):
     """A probability distribution estimated from counts in datafile.