712895b6b0d7f1563ee4149fe7d94445a3931233
1 # import re, string, random, glob, operator, heapq
7 from functools
import lru_cache
8 sys
.setrecursionlimit(1000000)
12 """Return a list of words that is the best segmentation of text.
14 if not text
: return []
15 candidates
= ([first
]+segment(rest
) for first
,rest
in splits(text
))
16 return max(candidates
, key
=Pwords
)
18 def splits(text
, L
=20):
19 """Return a list of all possible (first, rest) pairs, len(first)<=L.
21 return [(text
[:i
+1], text
[i
+1:])
22 for i
in range(min(len(text
), L
))]
25 """The Naive Bayes log probability of a sequence of words.
27 return sum(Pw
[w
] for w
in words
)
30 """A probability distribution estimated from counts in datafile.
31 Values are stored and returned as log probabilities.
33 def __init__(self
, data
=[], estimate_of_missing
=None):
34 data1
, data2
= itertools
.tee(data
)
35 self
.total
= sum([int(d
[1]) for d
in data1
])
36 for key
, count
in data2
:
37 self
[key
] = log10(int(count
) / self
.total
)
38 self
.estimate_of_missing
= estimate_of_missing
or (lambda k
, N
: 1./N
)
39 def __missing__(self
, key
):
40 return self
.estimate_of_missing(key
, self
.total
)
42 def datafile(name
, sep
='\t'):
43 """Read key,value pairs from file.
45 with
open(name
, 'r') as f
:
49 def avoid_long_words(key
, N
):
50 """Estimate the probability of an unknown word.
52 return -log10((N
* 10**(len(key
) - 2)))
54 Pw
= Pdist(datafile('count_1w.txt'), avoid_long_words
)