Segmentation is working, though it hits the recursion limit for texts longer than 250 characters

author Neil Smith <neil.git@njae.me.uk>

Sat, 19 Oct 2013 14:09:06 +0000 (15:09 +0100)

committer Neil Smith <neil.git@njae.me.uk>

Sat, 19 Oct 2013 14:09:06 +0000 (15:09 +0100)
author Neil Smith <neil.git@njae.me.uk>
Sat, 19 Oct 2013 14:09:06 +0000 (15:09 +0100)
committer Neil Smith <neil.git@njae.me.uk>
Sat, 19 Oct 2013 14:09:06 +0000 (15:09 +0100)
diff --git a/cipher.py b/cipher.py

index 8034043e8c59807df788f3add91e426b491ff2be..752efed3582c9caba29f2bb7db35b36c1d38dbd2 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -2,6 +2,7 @@ import string
  import collections
  import norms
  import logging
+from segment import segment
  
  logger = logging.getLogger(__name__)
  logger.addHandler(logging.FileHandler('cipher.log'))
diff --git a/segment.py b/segment.py

index e4b0d8ba3d70654650756a131f4d5dc7800e166e..e4b019f4c8248d8647f938fea8295c132308de0b 100644 (file)
--- a/segment.py
+++ b/segment.py
@@ -2,6 +2,7 @@
  import string
  import collections
  from math import log10
+import itertools
  
  def memo(f):
      "Memoize function f."
@@ -18,7 +19,7 @@ def segment(text):
      """Return a list of words that is the best segmentation of text.
      """
      if not text: return []
-    candidates = ([first]+segment(rem) for first,rem in splits(text))
+    candidates = ([first]+segment(rest) for first,rest in splits(text))
      return max(candidates, key=Pwords)
  
  def splits(text, L=20):
@@ -30,22 +31,20 @@ def splits(text, L=20):
  def Pwords(words): 
      """The Naive Bayes log probability of a sequence of words.
      """
-    return sum(Pw(w) for w in words)
+    return sum(Pw[w] for w in words)
  
  class Pdist(dict):
      """A probability distribution estimated from counts in datafile.
      Values are stored and returned as log probabilities.
      """
      def __init__(self, data=[], estimate_of_missing=None):
-        self.total = sum([int(d[1]) for d in data])
-        for key, count in data:
+        data1, data2 = itertools.tee(data)
+        self.total = sum([int(d[1]) for d in data1])
+        for key, count in data2:
              self[key] = log10(int(count) / self.total)
          self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
-    def __call__(self, key): 
-        if key in self: 
-            return self[key]  
-        else: 
-            return self.estimate_of_missing(key, self.total)
+    def __missing__(self, key):
+        return self.estimate_of_missing(key, self.total)
  
  def datafile(name, sep='\t'):
      """Read key,value pairs from file.
@@ -59,6 +58,7 @@ def avoid_long_words(key, N):
      """
      return -log10((N * 10**(len(key) - 2)))
  
-N = 1024908267229 ## Number of tokens
+# N = 1024908267229 ## Number of tokens
  
  Pw  = Pdist(datafile('count_1w.txt'), avoid_long_words)
+
author	Neil Smith <neil.git@njae.me.uk>
	Sat, 19 Oct 2013 14:09:06 +0000 (15:09 +0100)
committer	Neil Smith <neil.git@njae.me.uk>
	Sat, 19 Oct 2013 14:09:06 +0000 (15:09 +0100)
cipher.py		patch | blob | history
segment.py		patch | blob | history