X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=segment.py;h=a64ea5d4eb4248edaaff12c791df088f8109e21c;hb=3ba8a3b82ccec64a2939708f428354176af6746e;hp=1af1b62fc8eb3270c35c4bb39a773804faf8da47;hpb=4ee90e9b9683a7688d9a2b7ed972a2511530c0ba;p=cipher-training.git diff --git a/segment.py b/segment.py index 1af1b62..a64ea5d 100644 --- a/segment.py +++ b/segment.py @@ -1,3 +1,5 @@ +"""Segment a collection of letters into words""" + import language_models import sys from functools import lru_cache @@ -8,7 +10,7 @@ def segment(text): """Return a list of words that is the best segmentation of text. """ if not text: return [] - candidates = ([first]+segment(rest) for first,rest in splits(text)) + candidates = ([first]+segment(rest) for first, rest in splits(text)) return max(candidates, key=language_models.Pwords) @lru_cache() @@ -16,13 +18,13 @@ def segment_wrong(text): """Return a list of words that is the best segmentation of text. """ if not text: return [] - candidates = ([first]+segment(rest) for first,rest in splits(text)) + candidates = ([first]+segment(rest) for first, rest in splits(text)) return max(candidates, key=language_models.Pwords_wrong) def splits(text, L=20): """Return a list of all possible (first, rest) pairs, len(first)<=L. """ - return [(text[:i+1], text[i+1:]) + return [(text[:i+1], text[i+1:]) for i in range(min(len(text), L))]