1 # import re, string, random, glob, operator, heapq
11 table
[args
] = f(*args
)
def segment(text):
    "Return a list of words that is the best segmentation of text."
    if not text:
        return []
    # Try every (first word, remainder) split, segment the remainder
    # recursively, and keep the candidate with the highest probability.
    candidates = [[first] + segment(rem) for first, rem in splits(text)]
    return max(candidates, key=Pwords)
def splits(text, L=20):
    "Return a list of all possible (first, rem) pairs, len(first)<=L."
    # cut runs over every prefix length from 1 up to min(len(text), L).
    pairs = []
    for cut in range(1, min(len(text), L) + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs
def Pwords(words):
    "The Naive Bayes probability of a sequence of words."
    # Words are assumed independent, so the sequence probability is
    # the product of the individual unigram probabilities.
    return product(map(Pw, words))
class Pdist(dict):
    "A probability distribution estimated from counts in datafile."
    def __init__(self, data=(), N=None, missingfn=None):
        """Accumulate counts from (key, count) pairs.

        data: iterable of (key, count) pairs; counts are summed per key.
              (Default changed from [] to () — a mutable default argument
              is shared across calls; a tuple is safe and equivalent here.)
        N: total token count; defaults to the sum of all counts.
        missingfn: callable (key, N) -> probability for unseen keys;
              defaults to the uniform estimate 1/N.
        """
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        # float() guarantees true division in __call__ on any Python version.
        # (.values() replaces Py2-only .itervalues(); works on Py2 and Py3.)
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, N: 1. / N)
    def __call__(self, key):
        "Return the probability of `key`; unseen keys use missingfn."
        if key in self:
            return self[key] / self.N
        else:
            return self.missingfn(key, self.N)
def datafile(name, sep='\t'):
    "Read key,value pairs from file."
    # `with open(...)` replaces the Py2-only file() builtin and closes the
    # handle even if the consumer abandons the generator.
    # NOTE(review): the visible source is truncated after the `for` line;
    # the yield below is reconstructed from how Pdist consumes this
    # generator (`for key,count in data:`) — confirm against the original.
    with open(name) as f:
        for line in f:
            yield line.split(sep)
def avoid_long_words(key, N):
    "Estimate the probability of an unknown word."
    # Each extra character divides the estimate by 10, so longer unknown
    # strings are exponentially less probable.
    penalty = 10 ** len(key)
    return 10. / (N * penalty)
N = 1024908267229  ## Number of tokens

# Unigram model: counts read from count_1w.txt, total N, with
# avoid_long_words supplying probabilities for unseen words.
Pw = Pdist(datafile('count_1w.txt'), N, avoid_long_words)