1 # import re, string, random, glob, operator, heapq
12 table
[args
] = f(*args
)
19 """Return a list of words that is the best segmentation of text.
21 if not text
: return []
22 candidates
= ([first
]+segment(rest
) for first
,rest
in splits(text
))
23 return max(candidates
, key
=Pwords
)
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first)<=L.

    Each pair cuts `text` after 1..min(len(text), L) characters, so the
    first component is never empty and never longer than L.
    """
    longest_first = min(len(text), L)
    return [(text[:cut], text[cut:]) for cut in range(1, longest_first + 1)]
32 """The Naive Bayes log probability of a sequence of words.
34 return sum(Pw
[w
] for w
in words
)
37 """A probability distribution estimated from counts in datafile.
38 Values are stored and returned as log probabilities.
40 def __init__(self
, data
=[], estimate_of_missing
=None):
41 data1
, data2
= itertools
.tee(data
)
42 self
.total
= sum([int(d
[1]) for d
in data1
])
43 for key
, count
in data2
:
44 self
[key
] = log10(int(count
) / self
.total
)
45 self
.estimate_of_missing
= estimate_of_missing
or (lambda k
, N
: 1./N
)
46 def __missing__(self
, key
):
47 return self
.estimate_of_missing(key
, self
.total
)
def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Yields each line of `name` split on `sep`.  Note the trailing
    newline stays attached to the last field; downstream int() calls
    tolerate it.

    NOTE(review): the loop body inside the `with` block was missing
    from this excerpt and has been reconstructed as the conventional
    implementation — confirm against the complete source.
    """
    with open(name, 'r') as f:
        for line in f:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.

    Returns a log10 probability that shrinks by one order of magnitude
    per character of `key`, so long unknown words are penalized.
    """
    # Same expression as -log10(N) - len(key) + 2, computed through a
    # single log10 call on the scaled denominator.
    scaled = N * 10**(len(key) - 2)
    return -log10(scaled)
# Unigram word distribution: log10 probabilities built from the
# (word, count) rows of 'count_1w.txt', with avoid_long_words as the
# back-off estimate for words not in the file.  Runs file I/O at import.
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)