From 4ee90e9b9683a7688d9a2b7ed972a2511530c0ba Mon Sep 17 00:00:00 2001
From: Neil Smith <neil.git@njae.me.uk>
Date: Sun, 1 Jun 2014 20:24:19 +0100
Subject: [PATCH] Included Python 3.4's Enum for keyword alphabets, added
 Pwords_wrong and friends for development of word segmentation

---
 cipher.py          | 38 +++++++++++++++++++++-----------------
 cipherbreak.py     | 13 +++++++------
 language_models.py |  7 +++++++
 segment.py         |  9 +++++++++
 4 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/cipher.py b/cipher.py
index 11bdde6..ba62d41 100644
--- a/cipher.py
+++ b/cipher.py
@@ -1,6 +1,7 @@
 import string
 import collections
 import math
+from enum import Enum
 from itertools import zip_longest, cycle, chain
 from language_models import *
 
@@ -240,28 +241,31 @@ def affine_decipher(message, multiplier=1, adder=0, one_based=True):
     return ''.join(enciphered)
 
 
-def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
+class Keyword_wrap_alphabet(Enum):
+    from_a = 0
+    from_last = 1
+    from_largest = 2
+
+
+def keyword_cipher_alphabet_of(keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
     """Find the cipher alphabet given a keyword.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
-    0 : from 'a'
-    1 : from the last letter in the sanitised keyword
-    2 : from the largest letter in the sanitised keyword
 
     >>> keyword_cipher_alphabet_of('bayes')
     'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', 0)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_a)
     'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', 1)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_last)
     'bayestuvwxzcdfghijklmnopqr'
-    >>> keyword_cipher_alphabet_of('bayes', 2)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_largest)
     'bayeszcdfghijklmnopqrtuvwx'
     """
-    if wrap_alphabet == 0:
+    if wrap_alphabet == Keyword_wrap_alphabet.from_a:
         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + 
                                               string.ascii_lowercase))
     else:
-        if wrap_alphabet == 1:
+        if wrap_alphabet == Keyword_wrap_alphabet.from_last:
             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
         else:
             last_keyword_letter = sorted(sanitise(keyword))[-1]
@@ -274,7 +278,7 @@ def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
     return cipher_alphabet
 
 
-def keyword_encipher(message, keyword, wrap_alphabet=0):
+def keyword_encipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
     """Enciphers a message with a keyword substitution cipher.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
@@ -284,18 +288,18 @@ def keyword_encipher(message, keyword, wrap_alphabet=0):
 
     >>> keyword_encipher('test message', 'bayes')
     'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', 0)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_a)
     'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', 1)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_last)
     'lskl dskkbus'
-    >>> keyword_encipher('test message', 'bayes', 2)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_largest)
     'qspq jsppbcs'
     """
     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
     return unaccent(message).lower().translate(cipher_translation)
 
-def keyword_decipher(message, keyword, wrap_alphabet=0):
+def keyword_decipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
     """Deciphers a message with a keyword substitution cipher.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
@@ -305,11 +309,11 @@ def keyword_decipher(message, keyword, wrap_alphabet=0):
     
     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
     'test message'
-    >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
+    >>> keyword_decipher('rsqr ksqqbds', 'bayes', Keyword_wrap_alphabet.from_a)
     'test message'
-    >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
+    >>> keyword_decipher('lskl dskkbus', 'bayes', Keyword_wrap_alphabet.from_last)
     'test message'
-    >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)                                                                                            
+    >>> keyword_decipher('qspq jsppbcs', 'bayes', Keyword_wrap_alphabet.from_largest)
     'test message'
     """
     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
diff --git a/cipherbreak.py b/cipherbreak.py
index 95b5c20..0d3fd78 100644
--- a/cipherbreak.py
+++ b/cipherbreak.py
@@ -131,14 +131,14 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters):
     frequency analysis
 
     >>> keyword_break(keyword_encipher('this is a test message for the ' \
-          'keyword decipherment', 'elephant', 1), \
+          'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834575011...)
+    (('elephant', <Keyword_wrap_alphabet.from_last: 1>), -52.834575011...)
     """
     best_keyword = ''
     best_wrap_alphabet = True
     best_fit = float("-inf")
-    for wrap_alphabet in range(3):
+    for wrap_alphabet in Keyword_wrap_alphabet:
         for keyword in wordlist:
             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
             fit = fitness(plaintext)
@@ -162,13 +162,14 @@ def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500
     frequency analysis
 
     >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
-          'keyword decipherment', 'elephant', 1), \
+          'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834575011...)
+    (('elephant', <Keyword_wrap_alphabet.from_last: 1>), -52.834575011...)
     """
     with Pool() as pool:
         helper_args = [(message, word, wrap, fitness) 
-                       for word in wordlist for wrap in range(3)]
+                       for word in wordlist 
+                       for wrap in Keyword_wrap_alphabet]
         # Gotcha: the helper function here needs to be defined at the top level 
         #   (limitation of Pool.starmap)
         breaks = pool.starmap(keyword_break_worker, helper_args, chunksize) 
diff --git a/language_models.py b/language_models.py
index ceb4596..52e7ac4 100644
--- a/language_models.py
+++ b/language_models.py
@@ -120,6 +120,7 @@ def log_probability_of_unknown_word(key, N):
     return -log10(N * 10**((len(key) - 2) * 1.4))
 
 Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
+Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N))
 Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
 P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
 
@@ -128,6 +129,12 @@ def Pwords(words):
     """
     return sum(Pw[w.lower()] for w in words)
 
+def Pwords_wrong(words): 
+    """The Naive Bayes log probability of a sequence of words.
+    """
+    return sum(Pw_wrong[w.lower()] for w in words)
+
+
 def Pletters(letters):
     """The Naive Bayes log probability of a sequence of letters.
     """
diff --git a/segment.py b/segment.py
index ba3ddd7..1af1b62 100644
--- a/segment.py
+++ b/segment.py
@@ -11,6 +11,15 @@ def segment(text):
     candidates = ([first]+segment(rest) for first,rest in splits(text))
     return max(candidates, key=language_models.Pwords)
 
+@lru_cache()
+def segment_wrong(text):
+    """Return a list of words that is the best segmentation of text.
+    """
+    if not text: return []
+    candidates = ([first]+segment(rest) for first,rest in splits(text))
+    return max(candidates, key=language_models.Pwords_wrong)
+
+
 def splits(text, L=20):
     """Return a list of all possible (first, rest) pairs, len(first)<=L.
     """
-- 
2.34.1