projects
/
cipher-tools.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
6e185eb
)
Segmentation working, though hits recursion limit for texts longer than 250 characters
author
Neil Smith
<neil.git@njae.me.uk>
Sat, 19 Oct 2013 14:09:06 +0000
(15:09 +0100)
committer
Neil Smith
<neil.git@njae.me.uk>
Sat, 19 Oct 2013 14:09:06 +0000
(15:09 +0100)
cipher.py
patch
|
blob
|
history
segment.py
patch
|
blob
|
history
diff --git
a/cipher.py
b/cipher.py
index 8034043e8c59807df788f3add91e426b491ff2be..752efed3582c9caba29f2bb7db35b36c1d38dbd2 100644
(file)
--- a/
cipher.py
+++ b/
cipher.py
@@ -2,6 +2,7 @@
import string
import collections
import norms
import logging
import collections
import norms
import logging
+from segment import segment
logger = logging.getLogger(__name__)
logger.addHandler(logging.FileHandler('cipher.log'))
logger = logging.getLogger(__name__)
logger.addHandler(logging.FileHandler('cipher.log'))
diff --git
a/segment.py
b/segment.py
index e4b0d8ba3d70654650756a131f4d5dc7800e166e..e4b019f4c8248d8647f938fea8295c132308de0b 100644
(file)
--- a/
segment.py
+++ b/
segment.py
@@ -2,6 +2,7 @@
import string
import collections
from math import log10
import string
import collections
from math import log10
+import itertools
def memo(f):
"Memoize function f."
def memo(f):
"Memoize function f."
@@ -18,7 +19,7 @@
def segment(text):
"""Return a list of words that is the best segmentation of text.
"""
if not text: return []
"""Return a list of words that is the best segmentation of text.
"""
if not text: return []
- candidates = ([first]+segment(rem) for first,rem in splits(text))
+ candidates = ([first]+segment(rest) for first,rest in splits(text))
return max(candidates, key=Pwords)
def splits(text, L=20):
return max(candidates, key=Pwords)
def splits(text, L=20):
@@ -30,22 +31,20 @@
def splits(text, L=20):
def Pwords(words):
"""The Naive Bayes log probability of a sequence of words.
"""
def Pwords(words):
"""The Naive Bayes log probability of a sequence of words.
"""
- return sum(Pw(w) for w in words)
+ return sum(Pw[w] for w in words)
class Pdist(dict):
"""A probability distribution estimated from counts in datafile.
Values are stored and returned as log probabilities.
"""
def __init__(self, data=[], estimate_of_missing=None):
class Pdist(dict):
"""A probability distribution estimated from counts in datafile.
Values are stored and returned as log probabilities.
"""
def __init__(self, data=[], estimate_of_missing=None):
- self.total = sum([int(d[1]) for d in data])
- for key, count in data:
+ data1, data2 = itertools.tee(data)
+ self.total = sum([int(d[1]) for d in data1])
+ for key, count in data2:
self[key] = log10(int(count) / self.total)
self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
self[key] = log10(int(count) / self.total)
self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N)
- def __call__(self, key):
- if key in self:
- return self[key]
- else:
- return self.estimate_of_missing(key, self.total)
+ def __missing__(self, key):
+ return self.estimate_of_missing(key, self.total)
def datafile(name, sep='\t'):
"""Read key,value pairs from file.
def datafile(name, sep='\t'):
"""Read key,value pairs from file.
@@ -59,6 +58,7 @@
def avoid_long_words(key, N):
"""
return -log10((N * 10**(len(key) - 2)))
"""
return -log10((N * 10**(len(key) - 2)))
-N = 1024908267229 ## Number of tokens
+# N = 1024908267229 ## Number of tokens
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)
Pw = Pdist(datafile('count_1w.txt'), avoid_long_words)
+