X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=segment.py;fp=segment.py;h=f90af1d92c8fe0046b4f5c7a4975f53e7011e6ea;hb=269665fe76e7aeb87165a87d3a1cbb755a5e3768;hp=0000000000000000000000000000000000000000;hpb=b70e360b5476dbbc9b4da67bedd15674ef34086f;p=cipher-tools.git

diff --git a/segment.py b/segment.py
new file mode 100644
index 0000000..f90af1d
--- /dev/null
+++ b/segment.py
@@ -0,0 +1,54 @@
+# import re, string, random, glob, operator, heapq
+import string
+import collections
+from math import log10
+
+def memo(f):
+    "Memoize function f."
+    table = {}
+    def fmemo(*args):
+        if args not in table:
+            table[args] = f(*args)
+        return table[args]
+    fmemo.memo = table
+    return fmemo
+
+@memo
+def segment(text):
+    "Return a list of words that is the best segmentation of text."
+    if not text: return []
+    candidates = ([first]+segment(rem) for first,rem in splits(text))
+    return max(candidates, key=Pwords)
+
+def splits(text, L=20):
+    "Return a list of all possible (first, rem) pairs, len(first)<=L."
+    return [(text[:i+1], text[i+1:]) 
+            for i in range(min(len(text), L))]
+
+def Pwords(words): 
+    "The Naive Bayes probability of a sequence of words."
+    return product(Pw(w) for w in words)
+
+class Pdist(dict):
+    "A probability distribution estimated from counts in datafile."
+    def __init__(self, data=[], N=None, missingfn=None):
+        for key,count in data:
+            self[key] = self.get(key, 0) + int(count)
+        self.N = float(N or sum(self.itervalues()))
+        self.missingfn = missingfn or (lambda k, N: 1./N)
+    def __call__(self, key): 
+        if key in self: return self[key]/self.N  
+        else: return self.missingfn(key, self.N)
+
+def datafile(name, sep='\t'):
+    "Read key,value pairs from file."
+    for line in file(name):
+        yield line.split(sep)
+
+def avoid_long_words(key, N):
+    "Estimate the probability of an unknown word."
+    return 10./(N * 10**len(key))
+
+N = 1024908267229 ## Number of tokens
+
+Pw  = Pdist(datafile('count_1w.txt'), N, avoid_long_words)