segment.py

   1 # import re, string, random, glob, operator, heapq
   2 import string
   3 import collections
   4 from math import log10
   5 import itertools
   6
   7 def memo(f):
   8     "Memoize function f."
   9     table = {}
  10     def fmemo(*args):
  11         if args not in table:
  12             table[args] = f(*args)
  13         return table[args]
  14     fmemo.memo = table
  15     return fmemo
  16
  17 @memo
  18 def segment(text):
  19     """Return a list of words that is the best segmentation of text.
  20     """
  21     if not text: return []
  22     candidates = ([first]+segment(rest) for first,rest in splits(text))
  23     return max(candidates, key=Pwords)
  24
  25 def splits(text, L=20):
  26     """Return a list of all possible (first, rest) pairs, len(first)<=L.
  27     """
  28     return [(text[:i+1], text[i+1:])
  29             for i in range(min(len(text), L))]
  30
  31 def Pwords(words):
  32     """The Naive Bayes log probability of a sequence of words.
  33     """
  34     return sum(Pw[w] for w in words)
  35
  36 class Pdist(dict):
  37     """A probability distribution estimated from counts in datafile.
  38     Values are stored and returned as log probabilities.
  39     """
  40     def __init__(self, data=[], estimate_of_missing=None):
  41         data1, data2 = itertools.tee(data)
  42         self.total = sum([int(d[1]) for d in data1])
  43         for key, count in data2:
  44             self[key] = log10(int(count) / self.total)
  45         self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
  46     def __missing__(self, key):
  47         return self.estimate_of_missing(key, self.total)
  48
  49 def datafile(name, sep='\t'):
  50     """Read key,value pairs from file.
  51     """
  52     with open(name, 'r') as f:
  53         for line in f:
  54             yield line.split(sep)
  55
  56 def avoid_long_words(key, N):
  57     """Estimate the probability of an unknown word.
  58     """
  59     return -log10((N * 10**(len(key) - 2)))
  60
  61 # N = 1024908267229 ## Number of tokens
  62
  63 Pw  = Pdist(datafile('count_1w.txt'), avoid_long_words)
  64