"""Split a run of unspaced text into the word sequence that scores best.

Candidate segmentations are ranked with ``Pwords`` from
``szyfrow.support.language_models`` (presumably a probability-like score
for a list of words -- confirm against that module).
"""
import sys
from functools import lru_cache

from szyfrow.support.language_models import Pwords

# Retained for backward compatibility: importing this module has always
# raised the interpreter's recursion limit, and other code may rely on it.
# `segment` below is iterative and no longer needs it.
sys.setrecursionlimit(1000000)


@lru_cache()
def segment(text):
    """Return a list of words that is the best segmentation of text.

    "Best" means the candidate word list for which ``Pwords`` returns the
    largest value; on ties the candidate with the shortest first word wins
    (``max`` keeps the first maximal item and ``splits`` yields the
    shortest prefix first).

    The segmentation is built bottom-up, from the end of ``text`` towards
    its start, so the stack depth stays constant however long ``text`` is.

    Args:
        text: The unspaced text to segment. It must be hashable, because
            results are memoised with ``lru_cache``.

    Returns:
        A list of the pieces of ``text``, in order; ``[]`` for empty text.
        The list is the object held in the cache, so callers should copy
        it before mutating it.
    """
    if not text:
        return []

    end = len(text)
    # best[i] is the best segmentation of the suffix text[i:].
    best = {end: []}
    for start in range(end - 1, -1, -1):
        pairs = splits(text[start:])
        # `rest` is always a suffix of `text`, so its length identifies the
        # already-solved position it starts at.
        best[start] = max(
            ([first] + best[end - len(rest)] for first, rest in pairs),
            key=Pwords)
        # Positions further ahead than the longest prefix `splits` offers
        # can never be looked up again; drop them to bound memory use.
        best.pop(start + len(pairs) + 1, None)
    return best[0]


def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first)<=L.

    Args:
        text: The sequence to split.
        L: Maximum length of ``first``.

    Returns:
        A list of ``(text[:k], text[k:])`` tuples for ``k`` from 1 up to
        ``min(len(text), L)``, shortest ``first`` first. The list is empty
        when ``text`` is empty or ``L`` is less than 1.
    """
    longest = min(len(text), L)
    return [(text[:cut], text[cut:]) for cut in range(1, longest + 1)]