From a28096fcfca066aea1cddc4ff29c2f09f0528852 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Mon, 14 Jul 2014 20:48:33 +0100 Subject: [PATCH] Breaking affine ciphers --- language_models.py | 38 +++----------------------------------- segment.py | 30 ------------------------------ 2 files changed, 3 insertions(+), 65 deletions(-) delete mode 100644 segment.py diff --git a/language_models.py b/language_models.py index 8f4bd9c..4798d73 100644 --- a/language_models.py +++ b/language_models.py @@ -65,44 +65,12 @@ def datafile(name, sep='\t'): english_counts = collections.Counter(dict(datafile('count_1l.txt'))) normalised_english_counts = norms.normalise(english_counts) +# The log 10 probabilities of letters +Pl = {l: log10(n) for l, n in normalised_english_counts.items()} + with open('words.txt', 'r') as f: keywords = [line.rstrip() for line in f] - -class Pdist(dict): - """A probability distribution estimated from counts in datafile. - Values are stored and returned as log probabilities. - """ - def __init__(self, data=[], estimate_of_missing=None): - data1, data2 = itertools.tee(data) - self.total = sum([d[1] for d in data1]) - for key, count in data2: - self[key] = log10(count / self.total) - self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N) - def __missing__(self, key): - return self.estimate_of_missing(key, self.total) - -def log_probability_of_unknown_word(key, N): - """Estimate the probability of an unknown word. - """ - return -log10(N * 10**((len(key) - 2) * 1.4)) - -Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word) -Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N)) -Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0) -P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0) -P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0) - -def Pwords(words): - """The Naive Bayes log probability of a sequence of words. - """ - return sum(Pw[w.lower()] for w in words) - -def Pwords_wrong(words): - """The Naive Bayes log probability of a sequence of words. - """ - return sum(Pw_wrong[w.lower()] for w in words) - def Pletters(letters): """The Naive Bayes log probability of a sequence of letters. """ diff --git a/segment.py b/segment.py deleted file mode 100644 index a64ea5d..0000000 --- a/segment.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Segment a collection of letters into words""" - -import language_models -import sys -from functools import lru_cache -sys.setrecursionlimit(1000000) - -@lru_cache() -def segment(text): - """Return a list of words that is the best segmentation of text. - """ - if not text: return [] - candidates = ([first]+segment(rest) for first, rest in splits(text)) - return max(candidates, key=language_models.Pwords) - -@lru_cache() -def segment_wrong(text): - """Return a list of words that is the best segmentation of text. - """ - if not text: return [] - candidates = ([first]+segment(rest) for first, rest in splits(text)) - return max(candidates, key=language_models.Pwords_wrong) - - -def splits(text, L=20): - """Return a list of all possible (first, rest) pairs, len(first)<=L. - """ - return [(text[:i+1], text[i+1:]) - for i in range(min(len(text), L))] - -- 2.34.1