Word segmentation not working, but it's now late...
[cipher-tools.git] / segment.py
from math import log10

def memo(f):
    "Memoize function f: cache results keyed by the argument tuple."
    table = {}
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]
    fmemo.memo = table  # expose the cache for inspection
    return fmemo

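# Illustrative behaviour of the decorator (example added here, not part
# of the original file):
#
#   @memo
#   def double(x):
#       return 2 * x
#
#   double(21)   # computed once, cached in double.memo under (21,)
#   double(21)   # second call is served from the cache
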
@memo
def segment(text):
    """Return a list of words that is the best segmentation of text."""
    if not text:
        return []
    candidates = ([first] + segment(rem) for first, rem in splits(text))
    return max(candidates, key=Pwords)

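# How the recursion explores candidates (illustrative trace): for 'atest'
# it compares ['a'] + segment('test'), ['at'] + segment('est'), and so on,
# keeping the split whose words score highest under Pwords. Thanks to
# @memo, each suffix of the text is solved only once, so there are at
# most len(text) subproblems, each with up to L candidate first words.
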
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first) <= L."""
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), L))]

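# Worked example (illustrative): with the cap L=2,
#
#   splits('word', 2) == [('w', 'ord'), ('wo', 'rd')]
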
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words."""
    return sum(Pw(w) for w in words)

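# Pw returns log10 probabilities, so this sum is the log of the product
# of the unigram probabilities, i.e. the Naive Bayes assumption that
# words are independent: log10 P(w1..wn) = log10 P(w1) + ... + log10 P(wn).
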
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log10 probabilities.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        # Materialise data first: it is typically a generator, and the
        # original code exhausted it while computing self.total, leaving
        # the dict empty (likely why segmentation was not working).
        data = list(data)
        self.total = sum(int(d[1]) for d in data)
        for key, count in data:
            self[key] = log10(int(count) / self.total)
        # The fallback must also return a log probability, not a raw one,
        # or unseen words would outscore every known word under max().
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: log10(1. / N))

    def __call__(self, key):
        if key in self:
            return self[key]
        else:
            return self.estimate_of_missing(key, self.total)

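# Minimal usage sketch (hypothetical words and counts, shown only to
# illustrate the API):
#
#   P_demo = Pdist([('the', '100'), ('of', '50'), ('and', '50')])
#   P_demo('the')    # log10(100/200) ~= -0.301
#   P_demo('zzzz')   # unseen: falls back to estimate_of_missing
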
def datafile(name, sep='\t'):
    """Yield (key, count) pairs from a two-column file, one pair per line."""
    with open(name, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            if line:  # skip blank lines rather than yielding a bad pair
                yield line.split(sep)

def avoid_long_words(key, N):
    """Estimate the log10 probability of an unknown word, penalising it
    by a factor of 10 for each letter beyond the second."""
    return -log10(N * 10**(len(key) - 2))

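# Worked example: with N ~= 1.02e12 (defined below), a 7-letter unknown
# word scores -log10(N * 10**5) ~= -17.0, and each additional letter
# costs another factor of 10 in probability. This discourages segment()
# from treating long unseen strings as single words.
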
N = 1024908267229  # number of tokens in the training corpus

Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)
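
# Hedged smoke test: assumes count_1w.txt (a two-column word<TAB>count
# unigram file, e.g. Norvig's) is present in the working directory.
if __name__ == '__main__':
    print(segment('thisisatest'))  # expect something like ['this', 'is', 'a', 'test']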