1 # import re, string, random, glob, operator, heapq
11 table
[args
] = f(*args
)
def segment(text):
    """Return a list of words forming the most probable segmentation of text.

    An empty string segments to the empty list. Otherwise every
    (first-word, remainder) split is scored and the best-scoring
    candidate under Pwords is returned.
    """
    if not text:
        return []
    best = None
    best_score = None
    for head, tail in splits(text):
        candidate = [head] + segment(tail)
        score = Pwords(candidate)
        # strict > keeps the first of equally-scored candidates,
        # matching max()'s tie-breaking
        if best is None or score > best_score:
            best, best_score = candidate, score
    return best
def splits(text, L=20):
    """Return every (first, rest) split of text with 1 <= len(first) <= L.

    An empty text yields an empty list; otherwise pairs are ordered by
    increasing length of the first part.
    """
    limit = min(len(text), L)
    pairs = []
    for cut in range(1, limit + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.

    Sums the per-word log probabilities given by Pw; an empty
    sequence scores 0.
    """
    total = 0
    for word in words:
        total += Pw(word)
    return total
36 """A probability distribution estimated from counts in datafile.
37 Values are stored and returned as log probabilities.
def __init__(self, data=(), estimate_of_missing=None):
    """Build the distribution from (key, count) pairs.

    data: iterable of (key, count) pairs; counts may be strings.
    estimate_of_missing: callable (key, total) -> log10 probability
        for keys absent from the distribution. Defaults to a uniform
        log10(1/total) estimate.
    """
    # Materialize so a one-shot iterator survives both passes below.
    data = list(data)
    self.total = sum(int(d[1]) for d in data)
    for key, count in data:
        # Stored values are log10 probabilities, per the class contract.
        self[key] = log10(int(count) / self.total)
    # BUG FIX: the previous default (lambda k, N: 1./N) returned a raw
    # probability, inconsistent with the log-probability contract stated
    # in the class docstring; return log10(1/N) instead.
    self.estimate_of_missing = estimate_of_missing or (lambda k, N: log10(1. / N))
44 def __call__(self
, key
):
48 return self
.estimate_of_missing(key
, self
.total
)
50 def datafile(name
, sep
='\t'):
51 """Read key,value pairs from file.
53 with
open(name
, 'r') as f
:
def avoid_long_words(key, N):
    """Estimate the log10 probability of an unknown word.

    Longer words are made exponentially less likely: the estimate is
    -log10(N * 10**(len(key) - 2)) for a corpus of N tokens.
    """
    penalty = N * 10 ** (len(key) - 2)
    return -log10(penalty)
# NOTE(review): 1024908267229 matches the token count of the Google
# trillion-word corpus -- confirm against the source of count_1w.txt.
N = 1024908267229 ## Number of tokens
# Unigram log-probability distribution read from count_1w.txt
# (presumably sep-delimited word/count lines -- see datafile);
# unknown words fall back to the avoid_long_words estimate.
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)