# segment.py -- best-probability word segmentation (cipher-tools repository).
import string
import collections
from math import log10
import itertools
import sys
from functools import lru_cache

# segment() recurses once per word of the input, so very long texts need a
# recursion limit far above CPython's default of 1000.
sys.setrecursionlimit(1000000)

@lru_cache(maxsize=None)
def segment(text):
    """Return a list of words that is the best segmentation of text.

    Tries every (first, rest) split of the text and keeps the candidate
    segmentation with the highest Naive Bayes log probability (Pwords).
    Results are memoized per suffix; maxsize must be None (unbounded)
    because the lru_cache default of 128 entries can evict suffixes
    mid-recursion and make the running time blow up on long inputs.
    """
    if not text:
        return []
    # Each candidate is the first word plus the best segmentation of the rest.
    candidates = ([first] + segment(rest) for first, rest in splits(text))
    return max(candidates, key=Pwords)

def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first) <= L.

    first is always a non-empty prefix of text and rest is the remaining
    suffix, so the pairs recombine to the original text.
    """
    longest = min(len(text), L)
    return [(text[:cut], text[cut:]) for cut in range(1, longest + 1)]

def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.

    Each word is scored independently (case-insensitively) against the
    unigram model Pw, so the log probabilities simply add up.
    """
    total = 0
    for word in words:
        total += Pw[word.lower()]
    return total

class Pdist(dict):
    """A probability distribution estimated from counts in datafile.

    Values are stored and returned as log probabilities.

    data: iterable of (key, count) pairs (count may be a numeric string).
    estimate_of_missing: callable (key, N) -> log probability used for
        unknown keys, where N is the total observed count.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        # Immutable () default instead of the mutable-list anti-pattern;
        # tee lets a single-pass iterable be both totalled and stored.
        data1, data2 = itertools.tee(data)
        self.total = sum(int(d[1]) for d in data1)
        for key, count in data2:
            self[key] = log10(int(count) / self.total)
        # Default for unknown keys is log10(1/N), matching the log-space
        # convention of the stored values.  (The previous raw 1/N was a
        # plain probability -- always positive -- and compared incorrectly
        # against the negative log probabilities of known words.)
        self.estimate_of_missing = (estimate_of_missing
                                    or (lambda k, N: log10(1 / N)))
    def __missing__(self, key):
        # Unknown words fall back to the smoothing estimate.
        return self.estimate_of_missing(key, self.total)

def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Yields one [key, value, ...] list per line, split on sep.  The
    trailing newline is stripped so the last field is clean: the old
    code leaked the '\n' into the final value and only worked because
    int() happens to tolerate surrounding whitespace.
    """
    with open(name, 'r') as f:
        for line in f:
            yield line.rstrip('\n').split(sep)

def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.

    Returns a log10 probability that shrinks by a factor of 10 for each
    extra letter, so long unknown words are heavily penalised relative
    to short ones.
    """
    penalty = N * 10 ** (len(key) - 2)
    return -log10(penalty)

# Unigram word model built from tab-separated word\tcount lines; words not
# in the file fall back to the length-penalised avoid_long_words estimate.
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)