X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=szyfrow%2Fsupport%2Fsegment.py;fp=szyfrow%2Fsupport%2Fsegment.py;h=954dd97f83a22b4ee1917e41045009dc76a4a246;hb=27c8005f6dea0026887b80a01b5f93a8f1b3c2b2;hp=0000000000000000000000000000000000000000;hpb=a870050db6bc974b1bb0d132001750b6624fb43f;p=szyfrow.git

diff --git a/szyfrow/support/segment.py b/szyfrow/support/segment.py
new file mode 100644
index 0000000..954dd97
--- /dev/null
+++ b/szyfrow/support/segment.py
@@ -0,0 +1,20 @@
+import sys
+from functools import lru_cache
+from szyfrow.support.language_models import Pwords
+
+sys.setrecursionlimit(10**6)  # segment() nests roughly one call level per input character
+
+@lru_cache()
+def segment(text):
+    """Split text into the list of words that Pwords scores highest."""
+    if text:
+        return max(([head] + segment(tail) for head, tail in splits(text)),
+                   key=Pwords)
+    return []
+
+def splits(text, L=20):
+    """List every (first, rest) cut of text with 1 <= len(first) <= L."""
+    longest = min(len(text), L)
+    return [(text[:cut], text[cut:])
+            for cut in range(1, longest + 1)]
+