From 4ee90e9b9683a7688d9a2b7ed972a2511530c0ba Mon Sep 17 00:00:00 2001
From: Neil Smith
Date: Sun, 1 Jun 2014 20:24:19 +0100
Subject: [PATCH] Included Python 3.4's Enum for keyword alphabets, added
 Pwords_wrong and friends for development of word segmentation

---
 cipher.py          | 38 +++++++++++++++++++++-----------------
 cipherbreak.py     | 13 +++++++------
 language_models.py |  7 +++++++
 segment.py         |  9 +++++++++
 4 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/cipher.py b/cipher.py
index 11bdde6..ba62d41 100644
--- a/cipher.py
+++ b/cipher.py
@@ -1,6 +1,7 @@
 import string
 import collections
 import math
+from enum import Enum
 from itertools import zip_longest, cycle, chain
 from language_models import *
 
@@ -240,28 +241,31 @@ def affine_decipher(message, multiplier=1, adder=0, one_based=True):
     return ''.join(enciphered)
 
 
-def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
+class Keyword_wrap_alphabet(Enum):
+    from_a = 0
+    from_last = 1
+    from_largest = 2
+
+
+def keyword_cipher_alphabet_of(keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
     """Find the cipher alphabet given a keyword.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
-    0 : from 'a'
-    1 : from the last letter in the sanitised keyword
-    2 : from the largest letter in the sanitised keyword
 
     >>> keyword_cipher_alphabet_of('bayes')
     'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', 0)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_a)
     'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', 1)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_last)
     'bayestuvwxzcdfghijklmnopqr'
-    >>> keyword_cipher_alphabet_of('bayes', 2)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_largest)
     'bayeszcdfghijklmnopqrtuvwx'
     """
-    if wrap_alphabet == 0:
+    if wrap_alphabet == Keyword_wrap_alphabet.from_a:
         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
                                               string.ascii_lowercase))
     else:
-        if wrap_alphabet == 1:
+        if wrap_alphabet == Keyword_wrap_alphabet.from_last:
             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
         else:
             last_keyword_letter = sorted(sanitise(keyword))[-1]
@@ -274,7 +278,7 @@ def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
     return cipher_alphabet
 
 
-def keyword_encipher(message, keyword, wrap_alphabet=0):
+def keyword_encipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
     """Enciphers a message with a keyword substitution cipher.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
@@ -284,18 +288,18 @@ def keyword_encipher(message, keyword, wrap_alphabet=0):
 
     >>> keyword_encipher('test message', 'bayes')
     'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', 0)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_a)
     'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', 1)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_last)
     'lskl dskkbus'
-    >>> keyword_encipher('test message', 'bayes', 2)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_largest)
     'qspq jsppbcs'
     """
     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
     return unaccent(message).lower().translate(cipher_translation)
 
-def keyword_decipher(message, keyword, wrap_alphabet=0):
+def keyword_decipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
     """Deciphers a message with a keyword substitution cipher.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
@@ -305,11 +309,11 @@ def keyword_encipher(message, keyword, wrap_alphabet=0):
 
     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
     'test message'
-    >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
+    >>> keyword_decipher('rsqr ksqqbds', 'bayes', Keyword_wrap_alphabet.from_a)
     'test message'
-    >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
+    >>> keyword_decipher('lskl dskkbus', 'bayes', Keyword_wrap_alphabet.from_last)
     'test message'
-    >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)
+    >>> keyword_decipher('qspq jsppbcs', 'bayes', Keyword_wrap_alphabet.from_largest)
     'test message'
     """
     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
diff --git a/cipherbreak.py b/cipherbreak.py
index 95b5c20..0d3fd78 100644
--- a/cipherbreak.py
+++ b/cipherbreak.py
@@ -131,14 +131,14 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters):
     frequency analysis
 
     >>> keyword_break(keyword_encipher('this is a test message for the ' \
-        'keyword decipherment', 'elephant', 1), \
+        'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
         wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834575011...)
+    (('elephant', <Keyword_wrap_alphabet.from_last: 1>), -52.834575011...)
     """
     best_keyword = ''
     best_wrap_alphabet = True
     best_fit = float("-inf")
-    for wrap_alphabet in range(3):
+    for wrap_alphabet in Keyword_wrap_alphabet:
         for keyword in wordlist:
             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
             fit = fitness(plaintext)
@@ -162,13 +162,14 @@ def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500
     frequency analysis
 
     >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
-        'keyword decipherment', 'elephant', 1), \
+        'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
        wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834575011...)
+    (('elephant', <Keyword_wrap_alphabet.from_last: 1>), -52.834575011...)
""" with Pool() as pool: helper_args = [(message, word, wrap, fitness) - for word in wordlist for wrap in range(3)] + for word in wordlist + for wrap in Keyword_wrap_alphabet] # Gotcha: the helper function here needs to be defined at the top level # (limitation of Pool.starmap) breaks = pool.starmap(keyword_break_worker, helper_args, chunksize) diff --git a/language_models.py b/language_models.py index ceb4596..52e7ac4 100644 --- a/language_models.py +++ b/language_models.py @@ -120,6 +120,7 @@ def log_probability_of_unknown_word(key, N): return -log10(N * 10**((len(key) - 2) * 1.4)) Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word) +Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N)) Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0) P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0) @@ -128,6 +129,12 @@ def Pwords(words): """ return sum(Pw[w.lower()] for w in words) +def Pwords_wrong(words): + """The Naive Bayes log probability of a sequence of words. + """ + return sum(Pw_wrong[w.lower()] for w in words) + + def Pletters(letters): """The Naive Bayes log probability of a sequence of letters. """ diff --git a/segment.py b/segment.py index ba3ddd7..1af1b62 100644 --- a/segment.py +++ b/segment.py @@ -11,6 +11,15 @@ def segment(text): candidates = ([first]+segment(rest) for first,rest in splits(text)) return max(candidates, key=language_models.Pwords) +@lru_cache() +def segment_wrong(text): + """Return a list of words that is the best segmentation of text. + """ + if not text: return [] + candidates = ([first]+segment(rest) for first,rest in splits(text)) + return max(candidates, key=language_models.Pwords_wrong) + + def splits(text, L=20): """Return a list of all possible (first, rest) pairs, len(first)<=L. """ -- 2.34.1