Word segmentation not working, but it's now late...
[cipher-tools.git] / segment.py
from math import log10

def memo(f):
    "Memoize function f: cache results keyed by the argument tuple."
    table = {}
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]
    fmemo.memo = table  # expose the cache for inspection
    return fmemo

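# Illustrative behaviour of the decorator (example added here, not part
# of the original file):
#
#   @memo
#   def double(x):
#       return 2 * x
#
#   double(21)   # computed once, cached in double.memo under (21,)
#   double(21)   # second call is served from the cache
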
@memo
def segment(text):
    """Return a list of words that is the best segmentation of text."""
    if not text:
        return []
    candidates = ([first] + segment(rem) for first, rem in splits(text))
    return max(candidates, key=Pwords)

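# How the recursion explores candidates (illustrative trace): for 'atest'
# it compares ['a'] + segment('test'), ['at'] + segment('est'), and so on,
# keeping the split whose words score highest under Pwords. Thanks to
# @memo, each suffix of the text is solved only once, so there are at
# most len(text) subproblems, each with up to L candidate first words.
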
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first) <= L."""
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), L))]

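# Worked example (illustrative): with the cap L=2,
#
#   splits('word', 2) == [('w', 'ord'), ('wo', 'rd')]
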
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words."""
    return sum(Pw(w) for w in words)

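# Pw returns log10 probabilities, so this sum is the log of the product
# of the unigram probabilities, i.e. the Naive Bayes assumption that
# words are independent: log10 P(w1..wn) = log10 P(w1) + ... + log10 P(wn).
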
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log10 probabilities.
    """
    def __init__(self, data=(), estimate_of_missing=None):
        # Materialise data first: it is typically a generator, and the
        # original code exhausted it while computing self.total, leaving
        # the dict empty (likely why segmentation was not working).
        data = list(data)
        self.total = sum(int(d[1]) for d in data)
        for key, count in data:
            self[key] = log10(int(count) / self.total)
        # The fallback must also return a log probability, not a raw one,
        # or unseen words would outscore every known word under max().
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: log10(1. / N))

    def __call__(self, key):
        if key in self:
            return self[key]
        else:
            return self.estimate_of_missing(key, self.total)

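# Minimal usage sketch (hypothetical words and counts, shown only to
# illustrate the API):
#
#   P_demo = Pdist([('the', '100'), ('of', '50'), ('and', '50')])
#   P_demo('the')    # log10(100/200) ~= -0.301
#   P_demo('zzzz')   # unseen: falls back to estimate_of_missing
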
def datafile(name, sep='\t'):
    """Yield (key, count) pairs from a two-column file, one pair per line."""
    with open(name, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            if line:  # skip blank lines rather than yielding a bad pair
                yield line.split(sep)

def avoid_long_words(key, N):
    """Estimate the log10 probability of an unknown word, penalising it
    by a factor of 10 for each letter beyond the second."""
    return -log10(N * 10**(len(key) - 2))

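# Worked example: with N ~= 1.02e12 (defined below), a 7-letter unknown
# word scores -log10(N * 10**5) ~= -17.0, and each additional letter
# costs another factor of 10 in probability. This discourages segment()
# from treating long unseen strings as single words.
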
N = 1024908267229  # number of tokens in the training corpus

Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)
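
# Hedged smoke test: assumes count_1w.txt (a two-column word<TAB>count
# unigram file, e.g. Norvig's) is present in the working directory.
if __name__ == '__main__':
    print(segment('thisisatest'))  # expect something like ['this', 'is', 'a', 'test']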