Changed sanitise and segment to cope with capital letters

author Neil Smith <neil.github@njae.me.uk>

Thu, 14 Nov 2013 11:58:50 +0000 (11:58 +0000)

committer Neil Smith <neil.github@njae.me.uk>

Thu, 14 Nov 2013 11:58:50 +0000 (11:58 +0000)
author Neil Smith <neil.github@njae.me.uk>
Thu, 14 Nov 2013 11:58:50 +0000 (11:58 +0000)
committer Neil Smith <neil.github@njae.me.uk>
Thu, 14 Nov 2013 11:58:50 +0000 (11:58 +0000)
diff --git a/cipher.py b/cipher.py

index d738d8879c8e8a3c98fbc782c7a511a3a0fbf132..fdff17fc4e7c0c811253ef295c02d9791e7ec157 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -43,6 +43,14 @@ for a in range(26):
          c = (a * b) % 26
          modular_division_table[b][c] = a
  
+def letters(text):
+    """Remove all non-alphabetic characters from a text
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
  
  def sanitise(text):
      """Remove all non-alphabetic characters and convert the text to lowercase
@@ -52,8 +60,9 @@ def sanitise(text):
      >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
      'thequickbrownfoxjumpedoverthelazydog'
      """
-    sanitised = [c.lower() for c in text if c in string.ascii_letters]
-    return ''.join(sanitised)
+    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
+    # return ''.join(sanitised)
+    return letters(text).lower()
  
  def ngrams(text, n):
      """Returns all n-grams of a text
diff --git a/segment.py b/segment.py

index 712895b6b0d7f1563ee4149fe7d94445a3931233..bd15e00e41913aa5115da546fb70215143341df2 100644 (file)
--- a/segment.py
+++ b/segment.py
@@ -24,7 +24,7 @@ def splits(text, L=20):
  def Pwords(words): 
      """The Naive Bayes log probability of a sequence of words.
      """
-    return sum(Pw[w] for w in words)
+    return sum(Pw[w.lower()] for w in words)
  
  class Pdist(dict):
      """A probability distribution estimated from counts in datafile.
author	Neil Smith <neil.github@njae.me.uk>
	Thu, 14 Nov 2013 11:58:50 +0000 (11:58 +0000)
committer	Neil Smith <neil.github@njae.me.uk>
	Thu, 14 Nov 2013 11:58:50 +0000 (11:58 +0000)
cipher.py		patch | blob | history
segment.py		patch | blob | history