cipher-tools.git / segment.py
# import re, string, random, glob, operator, heapq
import string
import collections
from math import log10
import itertools
import sys
from functools import lru_cache
sys.setrecursionlimit(1000000)
@lru_cache()
def segment(text):
    """Return a list of words that is the best segmentation of text.
    """
    if not text: return []
    candidates = ([first] + segment(rest) for first, rest in splits(text))
    return max(candidates, key=Pwords)
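# Illustrative only: the exact segmentation depends on the counts loaded into
# Pw below, but with a typical unigram count file a call like
#   segment('listofwords')
# returns a list such as ['list', 'of', 'words'] whose concatenation is the
# original text.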
def splits(text, L=20):
    """Return a list of all possible (first, rest) pairs, len(first)<=L.
    """
    return [(text[:i+1], text[i+1:])
            for i in range(min(len(text), L))]
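# For example, splits('word', L=2) == [('w', 'ord'), ('wo', 'rd')]: every way
# of peeling off a non-empty prefix of at most L characters.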
def Pwords(words):
    """The Naive Bayes log probability of a sequence of words.
    """
    return sum(Pw[w] for w in words)
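# Because Pw stores log probabilities, the product of word probabilities
# becomes a sum of logs, e.g. Pwords(['the', 'cat']) == Pw['the'] + Pw['cat'].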
class Pdist(dict):
    """A probability distribution estimated from counts in datafile.
    Values are stored and returned as log probabilities.
    """
    def __init__(self, data=[], estimate_of_missing=None):
        data1, data2 = itertools.tee(data)
        self.total = sum([int(d[1]) for d in data1])
        for key, count in data2:
            self[key] = log10(int(count) / self.total)
        # The fallback for unseen keys should also be a log probability.
        self.estimate_of_missing = estimate_of_missing or (lambda k, N: log10(1 / N))

    def __missing__(self, key):
        return self.estimate_of_missing(key, self.total)
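# A minimal sketch of Pdist's behaviour with made-up, in-memory counts
# (the real Pw below is built from count_1w.txt instead):
#   p = Pdist([('the', 600), ('of', 400)])
#   p['the']    # log10(600 / 1000) ~= -0.22
#   p['xyzzy']  # unseen key, falls back to estimate_of_missing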
def datafile(name, sep='\t'):
    """Read key,value pairs from file.
    """
    with open(name, 'r') as f:
        for line in f:
            yield line.split(sep)
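# Each yielded item is a [key, count] pair; the count still carries its
# trailing newline, which int() tolerates when Pdist converts it.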
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word.
    """
    return -log10((N * 10**(len(key) - 2)))
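# This is log10(1 / (N * 10**(len(key) - 2))), i.e. each extra letter costs a
# factor of ten. For example, with N = 10**9 and a 5-letter unknown word the
# estimate is -(9 + 3) = -12.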
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)
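# A possible command-line check, not part of the original module: it assumes
# count_1w.txt sits alongside this file and that the text to segment is
# lower-case letters only, e.g.  python segment.py thisisatest
if __name__ == '__main__':
    if len(sys.argv) > 1:
        print(' '.join(segment(sys.argv[1])))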