szyfrow/support/segment.py

   1 """Segments a string of characters into words, following a language model.
   2
   3 Implementation taken from [Peter Norvig](https://norvig.com/ngrams/ch14.pdf)
   4 """
   5
   6 import sys
   7 from functools import lru_cache
   8 from szyfrow.support.language_models import Pwords
   9
  10 sys.setrecursionlimit(1000000)
  11
  12 @lru_cache()
  13 def segment(text):
  14     """Return a list of words that is the best segmentation of text.
  15     """
  16     if not text: return []
  17     candidates = ([first]+segment(rest) for first,rest in splits(text))
  18     return max(candidates, key=Pwords)
  19
  20 def splits(text, L=20):
  21     """Return a list of all possible (first, rest) pairs, len(first)<=L.
  22     """
  23     return [(text[:i+1], text[i+1:])
  24             for i in range(min(len(text), L))]
  25