Module szyfrow.support.segment
Setments a string of characters into words, following a language model.
Implementation taken from Peter Norvig
Expand source code
"""Setments a string of characters into words, following a language model.
Implementation taken from [Peter Norvig](https://norvig.com/ngrams/ch14.pdf)
"""
import sys
from functools import lru_cache
from szyfrow.support.language_models import Pwords
sys.setrecursionlimit(1000000)
@lru_cache()
def segment(text):
"""Return a list of words that is the best segmentation of text.
"""
if not text: return []
candidates = ([first]+segment(rest) for first,rest in splits(text))
return max(candidates, key=Pwords)
def splits(text, L=20):
"""Return a list of all possible (first, rest) pairs, len(first)<=L.
"""
return [(text[:i+1], text[i+1:])
for i in range(min(len(text), L))]
Functions
def segment(text)
-
Return a list of words that is the best segmentation of text.
Expand source code
@lru_cache() def segment(text): """Return a list of words that is the best segmentation of text. """ if not text: return [] candidates = ([first]+segment(rest) for first,rest in splits(text)) return max(candidates, key=Pwords)
def splits(text, L=20)
-
Return a list of all possible (first, rest) pairs, len(first)<=L.
Expand source code
def splits(text, L=20): """Return a list of all possible (first, rest) pairs, len(first)<=L. """ return [(text[:i+1], text[i+1:]) for i in range(min(len(text), L))]