import itertools
import sys
from functools import lru_cache
from math import log10

# segment() recurses once per word boundary, so very long inputs need a
# recursion limit far above the default 1000.
sys.setrecursionlimit(1000000)
@lru_cache(maxsize=None)
def segment(text):
    """Return a list of words that is the best segmentation of text.

    Tries every (first, rest) split from splits(), recursively segments
    the remainder, and keeps the candidate with the highest Pwords
    log-probability.  Memoized with lru_cache: without caching the
    recursion revisits the same suffixes and is exponential in len(text).
    NOTE(review): callers must not mutate the returned list — it is the
    cached object.
    """
    if not text:
        return []
    candidates = ([first] + segment(rest) for first, rest in splits(text))
    return max(candidates, key=Pwords)
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first)<=L."""
    # Cut points run from 1 up to the shorter of len(text) and L.
    longest_first = min(len(text), L)
    pairs = []
    for cut in range(1, longest_first + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.

    Sums per-word log10 probabilities from the module-level Pw
    distribution; words are lowercased for lookup, and unseen words are
    scored by Pw.__missing__.
    """
    return sum(Pw[w.lower()] for w in words)
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log probabilities.
    """

    def __init__(self, data=(), estimate_of_missing=None):
        """Build the distribution from (key, count) pairs.

        data: iterable of (key, count) pairs; count may be a str or int.
        estimate_of_missing: callable (key, total) -> log10 probability
        for unseen keys; defaults to the log of a uniform 1/total mass.
        """
        # tee() lets us consume the (possibly one-shot) iterable twice:
        # once to total the counts, once to store the probabilities.
        data1, data2 = itertools.tee(data)
        self.total = sum(int(d[1]) for d in data1)
        for key, count in data2:
            self[key] = log10(int(count) / self.total)
        # The default must return a LOG probability to be consistent with
        # the stored values (the original raw 1./N was a unit-mismatch bug).
        self.estimate_of_missing = estimate_of_missing or (
            lambda k, N: log10(1. / N))

    def __missing__(self, key):
        # dict lookup miss -> delegate to the smoothing estimate.
        return self.estimate_of_missing(key, self.total)
def datafile(name, sep='\t'):
    """Read key,value pairs from file.

    Yields one [key, value] list per line, split on `sep`.  The trailing
    newline stays attached to the value; consumers such as Pdist use
    int(), which tolerates surrounding whitespace.
    """
    with open(name, 'r') as f:
        for line in f:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.

    Returns a log10 probability that shrinks by one order of magnitude
    per extra character, so long unknown words are heavily penalized.
    """
    length_penalty = 10 ** (len(key) - 2)
    return -log10(N * length_penalty)
# Unigram log-probability distribution built from the count_1w.txt word
# counts; unknown words fall back to the length-penalizing estimate.
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)