# Updated .gitignore to ignore .pyc files
# [cipher-tools.git] / segment.py
1 # import re, string, random, glob, operator, heapq
2 import string
3 import collections
4 from math import log10
5
def memo(f):
    """Memoize function f.

    Results are cached by positional-argument tuple, so f's arguments must
    be hashable (segment() passes strings).  The cache dict is exposed as
    the ``memo`` attribute of the returned wrapper, preserving the original
    interface.
    """
    from functools import wraps  # function-scope import keeps the block self-contained

    table = {}

    @wraps(f)  # preserve f's name/docstring for introspection and debugging
    def fmemo(*args):
        if args not in table:
            table[args] = f(*args)
        return table[args]

    fmemo.memo = table
    return fmemo
15
@memo
def segment(text):
    "Return a list of words that is the best segmentation of text."
    # Base case: the empty string segments into no words.
    if not text:
        return []
    # Try every (first word, remainder) split, recursively segment the
    # remainder, and keep the candidate with the highest probability.
    candidates = ([head] + segment(tail) for head, tail in splits(text))
    return max(candidates, key=Pwords)
22
def splits(text, L=20):
    "Return a list of all possible (first, rem) pairs, len(first)<=L."
    # Candidate first words are prefixes of 1..min(len(text), L) characters.
    limit = min(len(text), L)
    pairs = []
    for cut in range(1, limit + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs
27
def Pwords(words):
    """The Naive Bayes probability of a sequence of words.

    Multiplies the unigram probabilities Pw(w) of each word, assuming
    independence.  The original called ``product``, which is never defined
    in this module (its import is commented out at the top) and so raised
    NameError; the product is now computed inline.
    """
    p = 1.0
    for w in words:
        p *= Pw(w)
    return p
31
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.

    Maps key -> count.  Calling the instance returns P(key) = count/N;
    unseen keys fall back to missingfn(key, N).
    """
    def __init__(self, data=(), N=None, missingfn=None):
        # data: iterable of (key, count) pairs; duplicate keys accumulate.
        # Default changed from the mutable [] to an immutable () — the
        # argument is only iterated, so this is behavior-compatible.
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        # N defaults to the observed total.  dict.itervalues() was removed
        # in Python 3; values() is the correct spelling on both counts.
        self.N = float(N or sum(self.values()))
        # Unseen keys get a uniform 1/N estimate unless told otherwise.
        self.missingfn = missingfn or (lambda k, N: 1. / N)

    def __call__(self, key):
        "Return the probability of `key` under this distribution."
        if key in self:
            return self[key] / self.N
        return self.missingfn(key, self.N)
42
def datafile(name, sep='\t'):
    """Read key,value pairs from file, yielding one list per line.

    The Python 2 ``file()`` builtin no longer exists; open via a context
    manager so the handle is closed even if the consumer abandons the
    generator early.
    """
    with open(name) as f:
        for line in f:
            # NOTE: the trailing newline is deliberately left on the last
            # field, matching the original behavior; Pdist's int() accepts it.
            yield line.split(sep)
47
def avoid_long_words(key, N):
    "Estimate the probability of an unknown word."
    # Each additional character costs a factor of 10, so long unseen
    # strings are heavily penalised relative to short ones.
    penalty = 10 ** len(key)
    return 10.0 / (N * penalty)
51
N = 1024908267229 ## Number of tokens (presumably the corpus total that count_1w.txt was built from -- TODO confirm)

# Unigram word-probability distribution, loaded at import time from the
# tab-separated file count_1w.txt (word<TAB>count per line).  Unknown
# words fall back to avoid_long_words.  NOTE(review): this runs file I/O
# as a module-level side effect; importing segment.py fails if
# count_1w.txt is not in the working directory.
Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)