X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=segment.py;h=a64ea5d4eb4248edaaff12c791df088f8109e21c;hb=3ba8a3b82ccec64a2939708f428354176af6746e;hp=ba3ddd7405a91a40c025fcd34b5eadfa7f8d0b11;hpb=3e8d2bd8cd7c623116fa3d2b77db954f51b191e4;p=cipher-training.git

diff --git a/segment.py b/segment.py
index ba3ddd7..a64ea5d 100644
--- a/segment.py
+++ b/segment.py
@@ -1,3 +1,5 @@
+"""Segment a collection of letters into words"""
+
 import language_models
 import sys
 from functools import lru_cache
@@ -8,12 +10,21 @@ def segment(text):
     """Return a list of words that is the best segmentation of text.
     """
     if not text: return []
-    candidates = ([first]+segment(rest) for first,rest in splits(text))
+    candidates = ([first]+segment(rest) for first, rest in splits(text))
     return max(candidates, key=language_models.Pwords)
 
+@lru_cache()
+def segment_wrong(text):
+    """Return the list of words maximising language_models.Pwords_wrong,
+    recursing into itself so every sub-split is scored by that same model."""
+    if not text: return []
+    candidates = ([first]+segment_wrong(rest) for first, rest in splits(text))
+    return max(candidates, key=language_models.Pwords_wrong)
+
+
 def splits(text, L=20):
     """Return a list of all possible (first, rest) pairs, len(first)<=L.
     """
-    return [(text[:i+1], text[i+1:]) 
+    return [(text[:i+1], text[i+1:])
             for i in range(min(len(text), L))]