From 7d5fbf6f51f9d68e684e4f46fcbc19b6153c9dc7 Mon Sep 17 00:00:00 2001 From: Neil Smith Date: Tue, 15 Jul 2014 08:43:45 +0100 Subject: [PATCH] Breaking affine ciphers --- cipherbreak.py | 1 - language_models.py | 33 +-------------------------------- segment.py | 30 ------------------------------ 3 files changed, 1 insertion(+), 63 deletions(-) delete mode 100644 segment.py diff --git a/cipherbreak.py b/cipherbreak.py index aba309d..7b004cf 100644 --- a/cipherbreak.py +++ b/cipherbreak.py @@ -5,7 +5,6 @@ import string import collections import norms import logging -from segment import segment import matplotlib.pyplot as plt diff --git a/language_models.py b/language_models.py index d8b9da8..bd09a15 100644 --- a/language_models.py +++ b/language_models.py @@ -64,43 +64,12 @@ def datafile(name, sep='\t'): english_counts = collections.Counter(dict(datafile('count_1l.txt'))) normalised_english_counts = norms.normalise(english_counts) +Pl = {l: log10(n) for l, n in normalised_english_counts.items()} with open('words.txt', 'r') as f: keywords = [line.rstrip() for line in f] -class Pdist(dict): - """A probability distribution estimated from counts in datafile. - Values are stored and returned as log probabilities. - """ - def __init__(self, data=[], estimate_of_missing=None): - data1, data2 = itertools.tee(data) - self.total = sum([d[1] for d in data1]) - for key, count in data2: - self[key] = log10(count / self.total) - self.estimate_of_missing = estimate_of_missing or (lambda k, N: 1./N) - def __missing__(self, key): - return self.estimate_of_missing(key, self.total) - -def log_probability_of_unknown_word(key, N): - """Estimate the probability of an unknown word. - """ - return -log10(N * 10**((len(key) - 2) * 1.4)) - -Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word) -Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N)) -Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0) - -def Pwords(words): - """The Naive Bayes log probability of a sequence of words. - """ - return sum(Pw[w.lower()] for w in words) - -def Pwords_wrong(words): - """The Naive Bayes log probability of a sequence of words. - """ - return sum(Pw_wrong[w.lower()] for w in words) - def Pletters(letters): """The Naive Bayes log probability of a sequence of letters. """ diff --git a/segment.py b/segment.py deleted file mode 100644 index a64ea5d..0000000 --- a/segment.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Segment a collection of letters into words""" - -import language_models -import sys -from functools import lru_cache -sys.setrecursionlimit(1000000) - -@lru_cache() -def segment(text): - """Return a list of words that is the best segmentation of text. - """ - if not text: return [] - candidates = ([first]+segment(rest) for first, rest in splits(text)) - return max(candidates, key=language_models.Pwords) - -@lru_cache() -def segment_wrong(text): - """Return a list of words that is the best segmentation of text. - """ - if not text: return [] - candidates = ([first]+segment(rest) for first, rest in splits(text)) - return max(candidates, key=language_models.Pwords_wrong) - - -def splits(text, L=20): - """Return a list of all possible (first, rest) pairs, len(first)<=L. - """ - return [(text[:i+1], text[i+1:]) - for i in range(min(len(text), L))] - -- 2.34.1