cipher-tools.git / segment.py
# import re, string, random, glob, operator, heapq
import string
import collections
from math import log10
import itertools
import sys
from functools import lru_cache
sys.setrecursionlimit(1000000)
@lru_cache()
def segment(text):
    """Return a list of words that is the best segmentation of text.
    """
    if not text: return []
    candidates = ([first] + segment(rest) for first, rest in splits(text))
    return max(candidates, key=Pwords)
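# Illustrative only: the exact segmentation depends on the counts loaded into
# Pw below, but with a typical unigram count file a call like
#   segment('listofwords')
# returns a list such as ['list', 'of', 'words'] whose concatenation is the
# original text.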
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first)<=L.
    """
    return [(text[:i+1], text[i+1:])
            for i in range(min(len(text), L))]
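# For example, splits('word', L=2) == [('w', 'ord'), ('wo', 'rd')]: every way
# of peeling off a non-empty prefix of at most L characters.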
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.
    """
    return sum(Pw[w] for w in words)
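# Because Pw stores log probabilities, the product of word probabilities
# becomes a sum of logs, e.g. Pwords(['the', 'cat']) == Pw['the'] + Pw['cat'].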
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log probabilities.
    """
    def __init__(self, data=[], estimate_of_missing=None):
        data1, data2 = itertools.tee(data)
        self.total = sum([int(d[1]) for d in data1])
        for key, count in data2:
            self[key] = log10(int(count) / self.total)
        # The fallback for unseen keys should also be a log probability.
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: log10(1 / N))

    def __missing__(self, key):
        return self.estimate_of_missing(key, self.total)
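# A minimal sketch of Pdist's behaviour with made-up, in-memory counts
# (the real Pw below is built from count_1w.txt instead):
#   p = Pdist([('the', 600), ('of', 400)])
#   p['the']    # log10(600 / 1000) ~= -0.22
#   p['xyzzy']  # unseen key, falls back to estimate_of_missing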
def datafile(name, sep='\t'):
    """Read key,value pairs from file.
    """
    with open(name, 'r') as f:
        for line in f:
            yield line.split(sep)
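# Each yielded item is a [key, count] pair; the count still carries its
# trailing newline, which int() tolerates when Pdist converts it.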
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.
    """
    return -log10((N * 10**(len(key) - 2)))
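# This is log10(1 / (N * 10**(len(key) - 2))), i.e. each extra letter costs a
# factor of ten. For example, with N = 10**9 and a 5-letter unknown word the
# estimate is -(9 + 3) = -12.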
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)
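# A possible command-line check, not part of the original module: it assumes
# count_1w.txt sits alongside this file and that the text to segment is
# lower-case letters only, e.g.  python segment.py thisisatest
if __name__ == '__main__':
    if len(sys.argv) > 1:
        print(' '.join(segment(sys.argv[1])))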