From: Neil Smith <neil.git@njae.me.uk>
Date: Mon, 14 Jul 2014 20:01:42 +0000 (+0100)
Subject: Caesar ciphers
X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=5022df4c849824315bf0ee432b223f6cdc07712d;p=cipher-training.git

Caesar ciphers
---

diff --git a/cipher.py b/cipher.py
index e71902d..c5b29f0 100644
--- a/cipher.py
+++ b/cipher.py
@@ -3,9 +3,10 @@ them. See cipherbreak for automatic breaking of these ciphers
 """
 
 import string
-import collections
-from enum import Enum
-from itertools import zip_longest, cycle, chain
+# import collections
+# from enum import Enum
+# from itertools import zip_longest, cycle, chain
+
 from language_models import unaccent, sanitise
 
 
diff --git a/cipherbreak.py b/cipherbreak.py
deleted file mode 100644
index ddcb423..0000000
--- a/cipherbreak.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""A set of functions to break the ciphers give in ciphers.py.
-"""
-
-import string
-import collections
-import norms
-import logging
-import math
-
-import matplotlib.pyplot as plt
-
-logger = logging.getLogger(__name__)
-logger.addHandler(logging.FileHandler('cipher.log'))
-logger.setLevel(logging.WARNING)
-#logger.setLevel(logging.INFO)
-#logger.setLevel(logging.DEBUG)
-
-from cipher import *
-from language_models import *
-
-# To time a run:
-#
-# import timeit
-# c5a = open('2012/5a.ciphertext', 'r').read()
-# timeit.timeit('keyword_break(c5a)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break', number=1)
-# timeit.repeat('keyword_break_mp(c5a, chunksize=500)', setup='gc.enable() ; from __main__ import c5a ; from cipher import keyword_break_mp', repeat=5, number=1)
-
-
-def frequencies(text):
-    """Count the number of occurrences of each character in text
-
-    >>> sorted(frequencies('abcdefabc').items())
-    [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
-    >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
-         'dog').items()) # doctest: +NORMALIZE_WHITESPACE
-    [(' ', 8), ('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1),
-     ('g', 1), ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1),
-     ('n', 1), ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2),
-     ('v', 1), ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> sorted(frequencies('The Quick BROWN fox jumped! over... the ' \
-         '(9lazy) DOG').items()) # doctest: +NORMALIZE_WHITESPACE
-    [(' ', 8), ('!', 1), ('(', 1), (')', 1), ('.', 3), ('9', 1), ('B', 1),
-     ('D', 1), ('G', 1), ('N', 1), ('O', 2), ('Q', 1), ('R', 1), ('T', 1),
-     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
-     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
-     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
-         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
-    [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
-     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
-     ('o', 4), ('p', 1), ('q', 1), ('r', 2), ('t', 2), ('u', 2), ('v', 1),
-     ('w', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> frequencies('abcdefabcdef')['x']
-    0
-    """
-    return collections.Counter(c for c in text)
-
-
-def caesar_break(message, fitness=Pletters):
-    """Breaks a Caesar cipher using frequency analysis
-
-    >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
-          'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
-    (4, -130.849989015...)
-    >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
-          'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
-    (19, -128.82410410...)
-    >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
-          'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
-    (13, -126.25403935...)
-    """
-    sanitised_message = sanitise(message)
-    best_shift = 0
-    best_fit = float('-inf')
-    for shift in range(26):
-        plaintext = caesar_decipher(sanitised_message, shift)
-        fit = fitness(plaintext)
-        logger.debug('Caesar break attempt using key {0} gives fit of {1} '
-                     'and decrypt starting: {2}'.format(shift, fit,
-                                                        plaintext[:50]))
-        if fit > best_fit:
-            best_fit = fit
-            best_shift = shift
-    logger.info('Caesar break best fit: key {0} gives fit of {1} and '
-                'decrypt starting: {2}'.format(best_shift, best_fit, 
-                    caesar_decipher(sanitised_message, best_shift)[:50]))
-    return best_shift, best_fit
-
-
-def plot_frequency_histogram(freqs, sort_key=None):
-    x = range(len(freqs.keys()))
-    y = [freqs[l] for l in sorted(freqs.keys(), key=sort_key)]
-    f = plt.figure()
-    ax = f.add_axes([0.1, 0.1, 0.9, 0.9])
-    ax.bar(x, y, align='center')
-    ax.set_xticks(x)
-    ax.set_xticklabels(sorted(freqs.keys(), key=sort_key))
-    f.show()
-
-
-if __name__ == "__main__":
-    import doctest
-    doctest.testmod()
diff --git a/count_1l.txt b/count_1l.txt
deleted file mode 100644
index e9ac0c6..0000000
--- a/count_1l.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-e	758103
-t	560576
-o	504520
-a	490129
-i	421240
-n	419374
-h	416369
-s	404473
-r	373599
-d	267917
-l	259023
-u	190269
-m	172199
-w	154157
-y	143040
-c	141094
-f	135318
-g	117888
-p	100690
-b	92919
-v	65297
-k	54248
-x	7414
-j	6679
-q	5499
-z	3577
diff --git a/language_models.py b/language_models.py
index 4798d73..0ee0394 100644
--- a/language_models.py
+++ b/language_models.py
@@ -3,12 +3,7 @@ its use.
 """
 
 import string
-import random
-import norms
-import collections
 import unicodedata
-import itertools
-from math import log10
 
 def letters(text):
     """Remove all non-alphabetic characters from a text
@@ -54,39 +49,6 @@ def sanitise(text):
     return letters(unaccent(text)).lower()
 
 
-def datafile(name, sep='\t'):
-    """Read key,value pairs from file.
-    """
-    with open(name, 'r') as f:
-        for line in f:
-            splits = line.split(sep)
-            yield [splits[0], int(splits[1])]
-
-english_counts = collections.Counter(dict(datafile('count_1l.txt')))
-normalised_english_counts = norms.normalise(english_counts)
-
-# The log 10 probabilities of letters
-Pl = {l: log10(n) for l, n in normalised_english_counts.items()}
-
-with open('words.txt', 'r') as f:
-    keywords = [line.rstrip() for line in f]
-
-def Pletters(letters):
-    """The Naive Bayes log probability of a sequence of letters.
-    """
-    return sum(Pl[l.lower()] for l in letters)
-
-
-def cosine_similarity_score(text):
-    """Finds the dissimilarity of a text to English, using the cosine distance
-    of the frequency distribution.
-
-    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
-    0.26228882...
-    """
-    return norms.cosine_similarity(english_counts,
-                                   collections.Counter(sanitise(text)))
-
 
 if __name__ == "__main__":
     import doctest
diff --git a/lettercount.py b/lettercount.py
deleted file mode 100644
index 956eca1..0000000
--- a/lettercount.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from language_models import sanitise
-import collections
-
-corpora = ['shakespeare.txt', 'sherlock-holmes.txt', 'war-and-peace.txt']
-counts = collections.Counter()
-
-for corpus in corpora:
-    text = sanitise(open(corpus).read())
-    counts.update(text)
-
-with open('count_1l.txt', 'w') as f:
-    for l, c in counts.most_common():
-        f.write("{}\t{}\n".format(l, c))