From 62bbe4277e9676b9255ef98a33ba2ad3dbc0c7ed Mon Sep 17 00:00:00 2001
From: Neil Smith
Date: Thu, 14 Nov 2013 11:58:50 +0000
Subject: [PATCH] Changed sanitise and segment to cope with capital letters

---
 cipher.py  | 13 +++++++++++--
 segment.py |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/cipher.py b/cipher.py
index d738d88..fdff17f 100644
--- a/cipher.py
+++ b/cipher.py
@@ -43,6 +43,14 @@ for a in range(26):
         c = (a * b) % 26
         modular_division_table[b][c] = a
 
+def letters(text):
+    """Remove all non-alphabetic characters from a text
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
 
 def sanitise(text):
     """Remove all non-alphabetic characters and convert the text to lowercase
@@ -52,8 +60,9 @@ def sanitise(text):
     >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
     'thequickbrownfoxjumpedoverthelazydog'
     """
-    sanitised = [c.lower() for c in text if c in string.ascii_letters]
-    return ''.join(sanitised)
+    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
+    # return ''.join(sanitised)
+    return letters(text).lower()
 
 def ngrams(text, n):
     """Returns all n-grams of a text
diff --git a/segment.py b/segment.py
index 712895b..bd15e00 100644
--- a/segment.py
+++ b/segment.py
@@ -24,7 +24,7 @@ def splits(text, L=20):
 def Pwords(words):
     """The Naive Bayes log probability of a sequence of words.
     """
-    return sum(Pw[w] for w in words)
+    return sum(Pw[w.lower()] for w in words)
 
 class Pdist(dict):
     """A probability distribution estimated from counts in datafile.
-- 
2.34.1