# Updated .gitignore to ignore .pyc files
# [cipher-tools.git] / segment.py
1 # import re, string, random, glob, operator, heapq
2 import string
3 import collections
4 from math import log10
5
def memo(f):
    """Memoize function f.

    Results are cached by positional-argument tuple, so f's arguments must
    be hashable (segment() passes strings).  The cache dict is exposed as
    the ``memo`` attribute of the returned wrapper, preserving the original
    interface.
    """
    from functools import wraps  # function-scope import keeps the block self-contained

    table = {}

    @wraps(f)  # preserve f's name/docstring for introspection and debugging
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]

    fmemo.memo = table
    return fmemo
15
@memo
def segment(text):
    "Return a list of words that is the best segmentation of text."
    # Base case: the empty string segments into no words.
    if not text:
        return []
    # Try every (first word, remainder) split, recursively segment the
    # remainder, and keep the candidate with the highest probability.
    candidates = ([head] + segment(tail) for head, tail in splits(text))
    return max(candidates, key=Pwords)
22
def splits(text, L=20):
    "Return a list of all possible (first, rem) pairs, len(first)<=L."
    # Candidate first words are prefixes of 1..min(len(text), L) characters.
    limit = min(len(text), L)
    pairs = []
    for cut in range(1, limit + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs
27
def Pwords(words):
    """The Naive Bayes probability of a sequence of words.

    Multiplies the unigram probabilities Pw(w) of each word, assuming
    independence.  The original called ``product``, which is never defined
    in this module (its import is commented out at the top) and so raised
    NameError; the product is now computed inline.
    """
    p = 1.0
    for w in words:
        p *= Pw(w)
    return p
31
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.

    Maps key -> count.  Calling the instance returns P(key) = count/N;
    unseen keys fall back to missingfn(key, N).
    """
    def __init__(self, data=(), N=None, missingfn=None):
        # data: iterable of (key, count) pairs; duplicate keys accumulate.
        # Default changed from the mutable [] to an immutable () — the
        # argument is only iterated, so this is behavior-compatible.
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        # N defaults to the observed total.  dict.itervalues() was removed
        # in Python 3; values() is the correct spelling on both counts.
        self.N = float(N or sum(self.values()))
        # Unseen keys get a uniform 1/N estimate unless told otherwise.
        self.missingfn = missingfn or (lambda k, N: 1. / N)

    def __call__(self, key):
        "Return the probability of `key` under this distribution."
        if key in self:
            return self[key] / self.N
        return self.missingfn(key, self.N)
42
def datafile(name, sep='\t'):
    """Read key,value pairs from file, yielding one list per line.

    The Python 2 ``file()`` builtin no longer exists; open via a context
    manager so the handle is closed even if the consumer abandons the
    generator early.
    """
    with open(name) as f:
        for line in f:
            # NOTE: the trailing newline is deliberately left on the last
            # field, matching the original behavior; Pdist's int() accepts it.
            yield line.split(sep)
47
def avoid_long_words(key, N):
    "Estimate the probability of an unknown word."
    # Each additional character costs a factor of 10, so long unseen
    # strings are heavily penalised relative to short ones.
    penalty = 10 ** len(key)
    return 10.0 / (N * penalty)
51
N = 1024908267229 ## Number of tokens (presumably the corpus total that count_1w.txt was built from -- TODO confirm)

# Unigram word-probability distribution, loaded at import time from the
# tab-separated file count_1w.txt (word<TAB>count per line).  Unknown
# words fall back to avoid_long_words.  NOTE(review): this runs file I/O
# as a module-level side effect; importing segment.py fails if
# count_1w.txt is not in the working directory.
Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)