Changed sanitise and segment to cope with capital letters

[cipher-tools.git] / cipher.py
diff --git a/cipher.py b/cipher.py

index f271120ccefcc7a687edd63964653547199c404c..fdff17fc4e7c0c811253ef295c02d9791e7ec157 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -43,6 +43,14 @@ for a in range(26):
          c = (a * b) % 26
          modular_division_table[b][c] = a
  
+def letters(text):
+    """Return text with every character that is not an ASCII letter removed
+    >>> letters('The Quick')
+    'TheQuick'
+    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
+    'TheQuickBROWNfoxjumpedoverthelazyDOG'
+    """
+    return ''.join([c for c in text if c in string.ascii_letters])
  
  def sanitise(text):
      """Remove all non-alphabetic characters and convert the text to lowercase
@@ -52,8 +60,9 @@ def sanitise(text):
      >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
      'thequickbrownfoxjumpedoverthelazydog'
      """
-    sanitised = [c.lower() for c in text if c in string.ascii_letters]
-    return ''.join(sanitised)
+    # sanitised = [c.lower() for c in text if c in string.ascii_letters]
+    # return ''.join(sanitised)
+    return letters(text).lower()
  
  def ngrams(text, n):
      """Returns all n-grams of a text
@@ -120,11 +129,14 @@ def frequencies(text):
       ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1), 
       ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1), 
       ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
+    >>> frequencies('abcdefabcdef')['x']
+    0
      """
-    counts = collections.defaultdict(int)
-    for c in text: 
-        counts[c] += 1
-    return counts
+    #counts = collections.defaultdict(int)
+    #for c in text: 
+    #    counts[c] += 1
+    #return counts
+    return collections.Counter(c for c in text)
  letter_frequencies = frequencies
  
  def deduplicate(text):
@@ -516,7 +528,9 @@ def keyword_break_mp(message,
          helper_args = [(message, word, wrap, metric, target_counts, 
                          message_frequency_scaling) 
                         for word in wordlist for wrap in range(3)]
-        breaks = pool.starmap(keyword_break_one, helper_args, chunksize) # Gotcha: the helper function here needs to be defined at the top level (limitation of Pool.starmap)
+        # Gotcha: keyword_break_one must be defined at module top level so that
+        #   it can be pickled for the worker processes (needed by Pool.starmap)
+        breaks = pool.starmap(keyword_break_one, helper_args, chunksize) 
          return min(breaks, key=lambda k: k[1])
  
  def keyword_break_one(message, keyword, wrap_alphabet, metric, target_counts,