1 # import re, string, random, glob, operator, heapq
# segment() below recurses once per (first, rest) split of its input, so
# long inputs need a far larger recursion limit than CPython's default.
sys.setrecursionlimit(1000000)
# NOTE(review): fragment of a memoization wrapper — the enclosing function
# and the definitions of `table` and `f` are outside this view; presumably
# this caches f's result under its argument tuple. Confirm against the
# full file before editing.
table[args] = f(*args)
21 """Return a list of words that is the best segmentation of text.
23 if not text
: return []
24 candidates
= ([first
]+segment(rest
) for first
,rest
in splits(text
))
25 return max(candidates
, key
=Pwords
)
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first)<=L."""
    # A cut point of k takes the first k characters as `first`.
    limit = min(len(text), L)
    pairs = []
    for cut in range(1, limit + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs
34 """The Naive Bayes log probability of a sequence of words.
36 return sum(Pw
[w
] for w
in words
)
39 """A probability distribution estimated from counts in datafile.
40 Values are stored and returned as log probabilities.
def __init__(self, data=(), estimate_of_missing=None):
    """Build the distribution from (key, count) pairs in `data`.

    Each key is stored as log10(count / total). Unseen keys fall back to
    `estimate_of_missing(key, total)` via __missing__.
    BUG FIX: the default for `data` was a shared mutable list ([]);
    an empty tuple is behaviorally identical and safe.
    """
    # tee the iterable: one pass to total the counts, one to normalize.
    data1, data2 = itertools.tee(data)
    self.total = sum(int(d[1]) for d in data1)  # genexp, no throwaway list
    for key, count in data2:
        # NOTE(review): raises ZeroDivisionError when `data` is non-empty
        # pairs with total 0, and iterating empty data skips this loop —
        # confirm callers always supply positive counts.
        self[key] = log10(int(count) / self.total)
    # Default estimator returns a plain 1/N, not a log10 value, unlike the
    # stored entries — mirrors the original; confirm this is intended.
    self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1. / N)
def __missing__(self, key):
    """Delegate unseen keys to the configured missing-word estimator."""
    fallback = self.estimate_of_missing
    return fallback(key, self.total)
def datafile(name, sep='\t'):
    """Read key,value pairs from file, yielding one sep-split list per line.

    NOTE(review): the loop body was truncated in this copy; reconstructed
    to yield line.split(sep), matching Pdist.__init__'s consumption of
    (key, count) pairs. The trailing newline survives in the last field;
    int() tolerates surrounding whitespace, so counts still parse.
    """
    with open(name, 'r') as f:
        for line in f:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word, as a log10 value.

    Each character beyond two divides the estimate by another factor of
    ten, so long unknown strings are strongly penalized.
    """
    length_penalty = 10 ** (len(key) - 2)
    return -log10(N * length_penalty)
# Unigram word distribution estimated from the count_1w.txt corpus file;
# words absent from the file fall back to the length-penalizing estimator.
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)