Included Python 3.4's Enum for keyword alphabets, added Pwords_wrong and friends...

author Neil Smith <neil.git@njae.me.uk>

Sun, 1 Jun 2014 19:24:19 +0000 (20:24 +0100)

committer Neil Smith <neil.git@njae.me.uk>

Sun, 1 Jun 2014 19:24:19 +0000 (20:24 +0100)
author Neil Smith <neil.git@njae.me.uk>
Sun, 1 Jun 2014 19:24:19 +0000 (20:24 +0100)
committer Neil Smith <neil.git@njae.me.uk>
Sun, 1 Jun 2014 19:24:19 +0000 (20:24 +0100)
diff --git a/cipher.py b/cipher.py

index 11bdde67916ca1303a1c6ce6d1318001d5233061..ba62d411a902e40f6bc1b259d2733111f891407d 100644 (file)
--- a/cipher.py
+++ b/cipher.py
@@ -1,6 +1,7 @@
  import string
  import collections
  import math
+from enum import Enum
  from itertools import zip_longest, cycle, chain
  from language_models import *
  
@@ -240,28 +241,31 @@ def affine_decipher(message, multiplier=1, adder=0, one_based=True):
      return ''.join(enciphered)
  
  
-def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
+class Keyword_wrap_alphabet(Enum):
+    from_a = 0
+    from_last = 1
+    from_largest = 2
+
+
+def keyword_cipher_alphabet_of(keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
      """Find the cipher alphabet given a keyword.
      wrap_alphabet controls how the rest of the alphabet is added
      after the keyword.
-    0 : from 'a'
-    1 : from the last letter in the sanitised keyword
-    2 : from the largest letter in the sanitised keyword
  
      >>> keyword_cipher_alphabet_of('bayes')
      'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', 0)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_a)
      'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', 1)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_last)
      'bayestuvwxzcdfghijklmnopqr'
-    >>> keyword_cipher_alphabet_of('bayes', 2)
+    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_largest)
      'bayeszcdfghijklmnopqrtuvwx'
      """
-    if wrap_alphabet == 0:
+    if wrap_alphabet == Keyword_wrap_alphabet.from_a:
          cipher_alphabet = ''.join(deduplicate(sanitise(keyword) + 
                                                string.ascii_lowercase))
      else:
-        if wrap_alphabet == 1:
+        if wrap_alphabet == Keyword_wrap_alphabet.from_last:
              last_keyword_letter = deduplicate(sanitise(keyword))[-1]
          else:
              last_keyword_letter = sorted(sanitise(keyword))[-1]
@@ -274,7 +278,7 @@ def keyword_cipher_alphabet_of(keyword, wrap_alphabet=0):
      return cipher_alphabet
  
  
-def keyword_encipher(message, keyword, wrap_alphabet=0):
+def keyword_encipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
      """Enciphers a message with a keyword substitution cipher.
      wrap_alphabet controls how the rest of the alphabet is added
      after the keyword.
@@ -284,18 +288,18 @@ def keyword_encipher(message, keyword, wrap_alphabet=0):
  
      >>> keyword_encipher('test message', 'bayes')
      'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', 0)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_a)
      'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', 1)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_last)
      'lskl dskkbus'
-    >>> keyword_encipher('test message', 'bayes', 2)
+    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_largest)
      'qspq jsppbcs'
      """
      cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
      cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
      return unaccent(message).lower().translate(cipher_translation)
  
-def keyword_decipher(message, keyword, wrap_alphabet=0):
+def keyword_decipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
      """Deciphers a message with a keyword substitution cipher.
      wrap_alphabet controls how the rest of the alphabet is added
      after the keyword.
@@ -305,11 +309,11 @@ def keyword_decipher(message, keyword, wrap_alphabet=0):
      
      >>> keyword_decipher('rsqr ksqqbds', 'bayes')
      'test message'
-    >>> keyword_decipher('rsqr ksqqbds', 'bayes', 0)
+    >>> keyword_decipher('rsqr ksqqbds', 'bayes', Keyword_wrap_alphabet.from_a)
      'test message'
-    >>> keyword_decipher('lskl dskkbus', 'bayes', 1)
+    >>> keyword_decipher('lskl dskkbus', 'bayes', Keyword_wrap_alphabet.from_last)
      'test message'
-    >>> keyword_decipher('qspq jsppbcs', 'bayes', 2)                                                                                            
+    >>> keyword_decipher('qspq jsppbcs', 'bayes', Keyword_wrap_alphabet.from_largest)
      'test message'
      """
      cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
diff --git a/cipherbreak.py b/cipherbreak.py

index 95b5c208975c5035a2b2c2f50d5c3ebeb5f6bb8a..0d3fd7828d5d3fbd69f86453298975dd176a33e1 100644 (file)
--- a/cipherbreak.py
+++ b/cipherbreak.py
@@ -131,14 +131,14 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters):
      frequency analysis
  
      >>> keyword_break(keyword_encipher('this is a test message for the ' \
-          'keyword decipherment', 'elephant', 1), \
+          'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834575011...)
+    (('elephant', <Keyword_wrap_alphabet.from_last: 1>), -52.834575011...)
      """
      best_keyword = ''
      best_wrap_alphabet = True
      best_fit = float("-inf")
-    for wrap_alphabet in range(3):
+    for wrap_alphabet in Keyword_wrap_alphabet:
          for keyword in wordlist:
              plaintext = keyword_decipher(message, keyword, wrap_alphabet)
              fit = fitness(plaintext)
@@ -162,13 +162,14 @@ def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500
      frequency analysis
  
      >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
-          'keyword decipherment', 'elephant', 1), \
+          'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834575011...)
+    (('elephant', <Keyword_wrap_alphabet.from_last: 1>), -52.834575011...)
      """
      with Pool() as pool:
          helper_args = [(message, word, wrap, fitness) 
-                       for word in wordlist for wrap in range(3)]
+                       for word in wordlist 
+                       for wrap in Keyword_wrap_alphabet]
          # Gotcha: the helper function here needs to be defined at the top level 
          #   (limitation of Pool.starmap)
          breaks = pool.starmap(keyword_break_worker, helper_args, chunksize) 
diff --git a/language_models.py b/language_models.py

index ceb4596eb2fd87d3d2375f338892f9652525f2d4..52e7ac43db6f62376735808f030c9d6aaa3ba17f 100644 (file)
--- a/language_models.py
+++ b/language_models.py
@@ -120,6 +120,7 @@ def log_probability_of_unknown_word(key, N):
      return -log10(N * 10**((len(key) - 2) * 1.4))
  
  Pw = Pdist(datafile('count_1w.txt'), log_probability_of_unknown_word)
+Pw_wrong = Pdist(datafile('count_1w.txt'), lambda _k, N: log10(1/N))
  Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0)
  P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0)
  
@@ -128,6 +129,12 @@ def Pwords(words):
      """
      return sum(Pw[w.lower()] for w in words)
  
+def Pwords_wrong(words): 
+    """The Naive Bayes log probability of a sequence of words under Pw_wrong,
+    whose unknown-word score is log10(1/N) whatever the word's length."""
+    return sum(Pw_wrong[w.lower()] for w in words)
+
+
  def Pletters(letters):
      """The Naive Bayes log probability of a sequence of letters.
      """
diff --git a/segment.py b/segment.py

index ba3ddd7405a91a40c025fcd34b5eadfa7f8d0b11..1af1b62fc8eb3270c35c4bb39a773804faf8da47 100644 (file)
--- a/segment.py
+++ b/segment.py
@@ -11,6 +11,15 @@ def segment(text):
      candidates = ([first]+segment(rest) for first,rest in splits(text))
      return max(candidates, key=language_models.Pwords)
  
+@lru_cache()
+def segment_wrong(text):
+    """Return the candidate split of text that scores highest under
+    language_models.Pwords_wrong; each remainder is segmented by segment()."""
+    if not text: return []
+    candidates = ([first]+segment(rest) for first,rest in splits(text))
+    return max(candidates, key=language_models.Pwords_wrong)
+
+
  def splits(text, L=20):
      """Return a list of all possible (first, rest) pairs, len(first)<=L.
      """
author	Neil Smith <neil.git@njae.me.uk>
	Sun, 1 Jun 2014 19:24:19 +0000 (20:24 +0100)
committer	Neil Smith <neil.git@njae.me.uk>
	Sun, 1 Jun 2014 19:24:19 +0000 (20:24 +0100)
cipher.py		patch | blob | history
cipherbreak.py		patch | blob | history
language_models.py		patch | blob | history
segment.py		patch | blob | history