Segmentation working, though hits recursion limit for texts longer than 250 characters
[cipher-tools.git] / segment.py
1 # import re, string, random, glob, operator, heapq
2 import string
3 import collections
4 from math import log10
5 import itertools
6
7 def memo(f):
8 "Memoize function f."
9 table = {}
10 def fmemo(*args):
11 if args not in table:
12 table[args] = f(*args)
13 return table[args]
14 fmemo.memo = table
15 return fmemo
16
17 @memo
18 def segment(text):
19 """Return a list of words that is the best segmentation of text.
20 """
21 if not text: return []
22 candidates = ([first]+segment(rest) for first,rest in splits(text))
23 return max(candidates, key=Pwords)
24
25 def splits(text, L=20):
26 """Return a list of all possible (first, rest) pairs, len(first)<=L.
27 """
28 return [(text[:i+1], text[i+1:])
29 for i in range(min(len(text), L))]
30
31 def Pwords(words):
32 """The Naive Bayes log probability of a sequence of words.
33 """
34 return sum(Pw[w] for w in words)
35
36 class Pdist(dict):
37 """A probability distribution estimated from counts in datafile.
38 Values are stored and returned as log probabilities.
39 """
40 def __init__(self, data=[], estimate_of_missing=None):
41 data1, data2 = itertools.tee(data)
42 self.total = sum([int(d[1]) for d in data1])
43 for key, count in data2:
44 self[key] = log10(int(count) / self.total)
45 self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
46 def __missing__(self, key):
47 return self.estimate_of_missing(key, self.total)
48
49 def datafile(name, sep='\t'):
50 """Read key,value pairs from file.
51 """
52 with open(name, 'r') as f:
53 for line in f:
54 yield line.split(sep)
55
56 def avoid_long_words(key, N):
57 """Estimate the probability of an unknown word.
58 """
59 return -log10((N * 10**(len(key) - 2)))
60
61 # N = 1024908267229 ## Number of tokens
62
63 Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)
64