1 # import re, string, random, glob, operator, heapq
# segment() below recurses once per (first, rest) split of its input, so
# long inputs need a far larger recursion limit than CPython's default.
sys.setrecursionlimit(1000000)
# NOTE(review): fragment of a memoization wrapper — the enclosing function
# and the definitions of `table` and `f` are outside this view; presumably
# this caches f's result under its argument tuple. Confirm against the
# full file before editing.
table[args] = f(*args)
21 """Return a list of words that is the best segmentation of text.
23 if not text
: return []
24 candidates
= ([first
]+segment(rest
) for first
,rest
in splits(text
))
25 return max(candidates
, key
=Pwords
)
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first)<=L."""
    # A cut point of k takes the first k characters as `first`.
    limit = min(len(text), L)
    pairs = []
    for cut in range(1, limit + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs
34 """The Naive Bayes log probability of a sequence of words.
36 return sum(Pw
[w
] for w
in words
)
39 """A probability distribution estimated from counts in datafile.
40 Values are stored and returned as log probabilities.
def __init__(self, data=(), estimate_of_missing=None):
    """Build the distribution from (key, count) pairs in `data`.

    Each key is stored as log10(count / total). Unseen keys fall back to
    `estimate_of_missing(key, total)` via __missing__.
    BUG FIX: the default for `data` was a shared mutable list ([]);
    an empty tuple is behaviorally identical and safe.
    """
    # tee the iterable: one pass to total the counts, one to normalize.
    data1, data2 = itertools.tee(data)
    self.total = sum(int(d[1]) for d in data1)  # genexp, no throwaway list
    for key, count in data2:
        # NOTE(review): raises ZeroDivisionError when `data` is non-empty
        # pairs with total 0, and iterating empty data skips this loop —
        # confirm callers always supply positive counts.
        self[key] = log10(int(count) / self.total)
    # Default estimator returns a plain 1/N, not a log10 value, unlike the
    # stored entries — mirrors the original; confirm this is intended.
    self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1. / N)
def __missing__(self, key):
    """Delegate unseen keys to the configured missing-word estimator."""
    fallback = self.estimate_of_missing
    return fallback(key, self.total)
def datafile(name, sep='\t'):
    """Read key,value pairs from file, yielding one sep-split list per line.

    NOTE(review): the loop body was truncated in this copy; reconstructed
    to yield line.split(sep), matching Pdist.__init__'s consumption of
    (key, count) pairs. The trailing newline survives in the last field;
    int() tolerates surrounding whitespace, so counts still parse.
    """
    with open(name, 'r') as f:
        for line in f:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word, as a log10 value.

    Each character beyond two divides the estimate by another factor of
    ten, so long unknown strings are strongly penalized.
    """
    length_penalty = 10 ** (len(key) - 2)
    return -log10(N * length_penalty)
# Unigram word distribution estimated from the count_1w.txt corpus file;
# words absent from the file fall back to the length-penalizing estimator.
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)