From d2960b05c3b7f4fb77add9d8df9201bb8903c002 Mon Sep 17 00:00:00 2001
From: Neil Smith
Date: Sun, 13 Jul 2014 23:13:09 +0100
Subject: [PATCH] More tweaking to conform with linting

---
 cipher.py          |  68 ++++++++++++-----------
 cipherbreak.py     | 133 +++++++++++++++++++++++----------------------
 language_models.py |  17 +++---
 norms.py           |  36 +++++++-----
 segment.py         |   8 ++-
 5 files changed, 138 insertions(+), 124 deletions(-)

diff --git a/cipher.py b/cipher.py
index 1656720..3783d57 100644
--- a/cipher.py
+++ b/cipher.py
@@ -44,7 +44,7 @@ def combine_every_nth(split_text):
     'abcdefghijklmnopqrstuvwxyz'
     """
     return ''.join([''.join(l)
-        for l in zip_longest(*split_text, fillvalue='')])
+                    for l in zip_longest(*split_text, fillvalue='')])
 
 def chunks(text, n, fillvalue=None):
     """Split a text into chunks of n characters
@@ -79,7 +79,7 @@ def transpose(items, transposition):
 
 def untranspose(items, transposition):
     """Undoes a transpose
-    
+
     >>> untranspose(['a', 'b', 'c', 'd'], [0,1,2,3])
     ['a', 'b', 'c', 'd']
     >>> untranspose(['d', 'b', 'c', 'a'], [3,1,2,0])
@@ -183,7 +183,7 @@ def caesar_decipher(message, shift):
     return caesar_encipher(message, -shift)
 
 def affine_encipher_letter(accented_letter, multiplier=1, adder=0,
-        one_based=True):
+                           one_based=True):
     """Encipher a letter, given a multiplier and adder
     >>> ''.join([affine_encipher_letter(l, 3, 5, True) \
             for l in string.ascii_uppercase])
@@ -225,7 +225,8 @@ def affine_decipher_letter(letter, multiplier=1, adder=0, one_based=True):
         if one_based: cipher_number += 1
         plaintext_number = (
             modular_division_table[multiplier]
-                [(cipher_number - adder) % 26] )
+                                  [(cipher_number - adder) % 26]
+            )
         if one_based: plaintext_number -= 1
         return chr(plaintext_number % 26 + alphabet_start)
     else:
@@ -254,7 +255,7 @@ def affine_decipher(message, multiplier=1, adder=0, one_based=True):
     return ''.join(enciphered)
 
 
-class Keyword_wrap_alphabet(Enum):
+class KeywordWrapAlphabet(Enum):
    """Ways of wrapping the alphabet for keyword-based substitution ciphers."""
    from_a = 1
    from_last = 2
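The affine hunks above only re-indent the modular_division_table lookup, but the arithmetic behind that table is worth spelling out: deciphering c = a*p + b (mod 26) means multiplying by the modular inverse of a. A minimal standalone sketch, with names of my own choosing and relying on Python 3.8+ for the three-argument pow:

    def affine_decipher_char(c, multiplier=3, adder=5):
        # pow(a, -1, 26) is the modular inverse that the repo's
        # modular_division_table precomputes
        inverse = pow(multiplier, -1, 26)
        number = ord(c) - ord('a')
        return chr((inverse * (number - adder)) % 26 + ord('a'))

    assert affine_decipher_char('f') == 'a'   # since 3*0 + 5 = 5 -> 'f'
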
@@ -262,25 +263,25 @@ class Keyword_wrap_alphabet(Enum):
 
 
 def keyword_cipher_alphabet_of(keyword,
-        wrap_alphabet=Keyword_wrap_alphabet.from_a):
+                               wrap_alphabet=KeywordWrapAlphabet.from_a):
     """Find the cipher alphabet given a keyword.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
 
     >>> keyword_cipher_alphabet_of('bayes')
     'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_a)
+    >>> keyword_cipher_alphabet_of('bayes', KeywordWrapAlphabet.from_a)
     'bayescdfghijklmnopqrtuvwxz'
-    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_last)
+    >>> keyword_cipher_alphabet_of('bayes', KeywordWrapAlphabet.from_last)
     'bayestuvwxzcdfghijklmnopqr'
-    >>> keyword_cipher_alphabet_of('bayes', Keyword_wrap_alphabet.from_largest)
+    >>> keyword_cipher_alphabet_of('bayes', KeywordWrapAlphabet.from_largest)
     'bayeszcdfghijklmnopqrtuvwx'
     """
-    if wrap_alphabet == Keyword_wrap_alphabet.from_a:
+    if wrap_alphabet == KeywordWrapAlphabet.from_a:
         cipher_alphabet = ''.join(deduplicate(sanitise(keyword) +
                                               string.ascii_lowercase))
     else:
-        if wrap_alphabet == Keyword_wrap_alphabet.from_last:
+        if wrap_alphabet == KeywordWrapAlphabet.from_last:
             last_keyword_letter = deduplicate(sanitise(keyword))[-1]
         else:
             last_keyword_letter = sorted(sanitise(keyword))[-1]
@@ -294,7 +295,7 @@ def keyword_cipher_alphabet_of(keyword,
 
 
 def keyword_encipher(message, keyword,
-        wrap_alphabet=Keyword_wrap_alphabet.from_a):
+                     wrap_alphabet=KeywordWrapAlphabet.from_a):
     """Enciphers a message with a keyword substitution cipher.
     wrap_alphabet controls how the rest of the alphabet is added
     after the keyword.
@@ -304,18 +305,19 @@ def keyword_encipher(message, keyword,
 
     >>> keyword_encipher('test message', 'bayes')
     'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_a)
+    >>> keyword_encipher('test message', 'bayes', KeywordWrapAlphabet.from_a)
     'rsqr ksqqbds'
-    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_last)
+    >>> keyword_encipher('test message', 'bayes', KeywordWrapAlphabet.from_last)
     'lskl dskkbus'
-    >>> keyword_encipher('test message', 'bayes', Keyword_wrap_alphabet.from_largest)
+    >>> keyword_encipher('test message', 'bayes', KeywordWrapAlphabet.from_largest)
     'qspq jsppbcs'
     """
     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
     cipher_translation = ''.maketrans(string.ascii_lowercase, cipher_alphabet)
     return unaccent(message).lower().translate(cipher_translation)
 
-def keyword_decipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_a):
+def keyword_decipher(message, keyword,
+                     wrap_alphabet=KeywordWrapAlphabet.from_a):
    """Deciphers a message with a keyword substitution cipher.
    wrap_alphabet controls how the rest of the alphabet is added
    after the keyword.
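The three KeywordWrapAlphabet modes exercised by these doctests differ only in where the unused letters resume. A self-contained sketch that reproduces the three doctest outputs (helper names are mine, not the module's):

    import string

    def dedupe(s):
        # keep first occurrence of each letter, preserving order
        return ''.join(sorted(set(s), key=s.index))

    def keyword_alphabet(keyword, start_after=None):
        # start_after=None gives from_a; pass the keyword's last letter
        # for from_last, or its alphabetically largest for from_largest.
        rest = string.ascii_lowercase
        if start_after:
            i = rest.index(start_after) + 1
            rest = rest[i:] + rest[:i]
        return dedupe(dedupe(keyword) + rest)

    assert keyword_alphabet('bayes') == 'bayescdfghijklmnopqrtuvwxz'
    assert keyword_alphabet('bayes', 's') == 'bayestuvwxzcdfghijklmnopqr'
    assert keyword_alphabet('bayes', 'y') == 'bayeszcdfghijklmnopqrtuvwx'
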
@@ -325,11 +327,11 @@ def keyword_decipher(message, keyword, wrap_alphabet=Keyword_wrap_alphabet.from_
 
     >>> keyword_decipher('rsqr ksqqbds', 'bayes')
     'test message'
-    >>> keyword_decipher('rsqr ksqqbds', 'bayes', Keyword_wrap_alphabet.from_a)
+    >>> keyword_decipher('rsqr ksqqbds', 'bayes', KeywordWrapAlphabet.from_a)
     'test message'
-    >>> keyword_decipher('lskl dskkbus', 'bayes', Keyword_wrap_alphabet.from_last)
+    >>> keyword_decipher('lskl dskkbus', 'bayes', KeywordWrapAlphabet.from_last)
     'test message'
-    >>> keyword_decipher('qspq jsppbcs', 'bayes', Keyword_wrap_alphabet.from_largest)
+    >>> keyword_decipher('qspq jsppbcs', 'bayes', KeywordWrapAlphabet.from_largest)
     'test message'
     """
     cipher_alphabet = keyword_cipher_alphabet_of(keyword, wrap_alphabet)
@@ -357,14 +359,14 @@ def vigenere_decipher(message, keyword):
     pairs = zip(message, cycle(shifts))
     return ''.join([caesar_decipher_letter(l, k) for l, k in pairs])
 
-beaufort_encipher=vigenere_decipher
-beaufort_decipher=vigenere_encipher
+beaufort_encipher = vigenere_decipher
+beaufort_decipher = vigenere_encipher
 
 def transpositions_of(keyword):
     """Finds the transpositions given by a keyword. For instance, the keyword
     'clever' rearranges to 'celrv', so the first column (0) stays first, the
-    second column (1) moves to third, the third column (2) moves to second, 
+    second column (1) moves to third, the third column (2) moves to second,
     and so on.
 
     If passed a tuple, assume it's already a transposition and just return it.
@@ -408,8 +410,8 @@ def pad(message_len, group_len, fillvalue):
     return padding
 
 def column_transposition_encipher(message, keyword, fillvalue=' ',
-        fillcolumnwise=False,
-        emptycolumnwise=False):
+                                  fillcolumnwise=False,
+                                  emptycolumnwise=False):
     """Enciphers using the column transposition cipher.
     Message is padded to allow all rows to be the same length.
 
@@ -457,8 +459,8 @@ def column_transposition_encipher(message, keyword, fillvalue=' ',
     return ''.join(chain(*transposed))
 
 def column_transposition_decipher(message, keyword, fillvalue=' ',
-        fillcolumnwise=False,
-        emptycolumnwise=False):
+                                  fillcolumnwise=False,
+                                  emptycolumnwise=False):
     """Deciphers using the column transposition cipher.
     Message is padded to allow all rows to be the same length.
 
@@ -508,12 +510,12 @@ def scytale_encipher(message, rows, fillvalue=' '):
     """
     transpositions = [i for i in range(rows)]
     return column_transposition_encipher(message, transpositions,
-        fillvalue=fillvalue, fillcolumnwise=True, emptycolumnwise=False)
+            fillvalue=fillvalue, fillcolumnwise=True, emptycolumnwise=False)
 
 def scytale_decipher(message, rows):
     """Deciphers using the scytale transposition cipher.
     Assumes the message is padded so that all rows are the same length.
-    
+
     >>> scytale_decipher('tcnhkfeboqrxuo iw ', 3)
    'thequickbrownfox '
    >>> scytale_decipher('tubnhirfecooqkwx', 4)
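The 'clever' example in the transpositions_of docstring is easier to see in code: deduplicate the keyword, then rank its letters alphabetically to get the column order. A sketch of that reading (my names, not necessarily the module's implementation):

    def transpositions_sketch(keyword):
        # 'clever' deduplicates to 'clevr'; each letter's alphabetical
        # rank within the deduplicated keyword is its column's destination
        key = ''.join(sorted(set(keyword), key=keyword.index))
        return tuple(sorted(key).index(c) for c in key)

    assert transpositions_sketch('clever') == (0, 2, 1, 4, 3)
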
@@ -600,14 +602,14 @@ class PocketEnigma(object):
         ValueError: Wheel specification does not contain 26 letters
         """
         if len(wheel_spec) != 13:
-            raise ValueError("Wheel specification has {} pairs, requires 13".
-                format(len(wheel_spec)))
+            raise ValueError("Wheel specification has {} pairs, requires"
+                             " 13".format(len(wheel_spec)))
         for p in wheel_spec:
             if len(p) != 2:
                 raise ValueError("Not all mappings in wheel specification"
-                    "have two elements")
-        if len(set([p[0] for p in wheel_spec] + 
-                [p[1] for p in wheel_spec])) != 26:
+                                 "have two elements")
+        if len(set([p[0] for p in wheel_spec] +
+                   [p[1] for p in wheel_spec])) != 26:
             raise ValueError("Wheel specification does not contain 26 letters")
 
     def encipher_letter(self, letter):
diff --git a/cipherbreak.py b/cipherbreak.py
index 3639da5..b28b763 100644
--- a/cipherbreak.py
+++ b/cipherbreak.py
@@ -1,12 +1,15 @@
+"""A set of functions to break the ciphers given in cipher.py.
+"""
+
 import string
 import collections
 import norms
 import logging
 import random
-from itertools import zip_longest, cycle, permutations, starmap
+import math
+from itertools import starmap
 from segment import segment
 from multiprocessing import Pool
-from math import log10
 
 import matplotlib.pyplot as plt
 
@@ -32,7 +35,7 @@ for word in keywords:
 
 def frequencies(text):
     """Count the number of occurrences of each character in text
-    
+
     >>> sorted(frequencies('abcdefabc').items())
     [('a', 2), ('b', 2), ('c', 2), ('d', 1), ('e', 1), ('f', 1)]
     >>> sorted(frequencies('the quick brown fox jumped over the lazy ' \
@@ -48,7 +51,7 @@ def frequencies(text):
     ('W', 1), ('a', 1), ('c', 1), ('d', 1), ('e', 4), ('f', 1), ('h', 2),
     ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('o', 2), ('p', 1),
     ('r', 1), ('t', 1), ('u', 2), ('v', 1), ('x', 1), ('y', 1), ('z', 1)]
-    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... '\
+    >>> sorted(frequencies(sanitise('The Quick BROWN fox jumped! over... ' \
         'the (9lazy) DOG')).items()) # doctest: +NORMALIZE_WHITESPACE
     [('a', 1), ('b', 1), ('c', 1), ('d', 2), ('e', 4), ('f', 1), ('g', 1),
     ('h', 2), ('i', 1), ('j', 1), ('k', 1), ('l', 1), ('m', 1), ('n', 1),
@@ -80,8 +83,8 @@ def caesar_break(message, fitness=Pletters):
         plaintext = caesar_decipher(sanitised_message, shift)
         fit = fitness(plaintext)
         logger.debug('Caesar break attempt using key {0} gives fit of {1} '
-            'and decrypt starting: {2}'.format(shift, fit,
-                plaintext[:50]))
+                     'and decrypt starting: {2}'.format(shift, fit,
+                                                        plaintext[:50]))
         if fit > best_fit:
             best_fit = fit
             best_shift = shift
@@ -109,12 +112,12 @@ def affine_break(message, fitness=Pletters):
     for one_based in [True, False]:
         for multiplier in [x for x in range(1, 26, 2) if x != 13]:
             for adder in range(26):
-                plaintext = affine_decipher(sanitised_message, 
+                plaintext = affine_decipher(sanitised_message,
                     multiplier, adder, one_based)
                 fit = fitness(plaintext)
                 logger.debug('Affine break attempt using key {0}x+{1} ({2}) '
                     'gives fit of {3} and decrypt starting: {4}'.
-                    format(multiplier, adder, one_based, fit, 
+                    format(multiplier, adder, one_based, fit,
                         plaintext[:50]))
                if fit > best_fit:
                    best_fit = fit
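caesar_break and affine_break share one shape: enumerate every key, score the trial decryption with a fitness function, keep the argmax. In miniature, with a toy fitness in place of the module's Pletters:

    def caesar_shift(text, shift):
        return ''.join(chr((ord(c) - 97 + shift) % 26 + 97) for c in text)

    def toy_fitness(text):
        # crude stand-in for Pletters: count common English letters
        return sum(text.count(c) for c in 'etaoinshr')

    def toy_caesar_break(ciphertext):
        return max(range(26),
                   key=lambda s: toy_fitness(caesar_shift(ciphertext, -s)))

    print(toy_caesar_break(caesar_shift('itisatruthuniversallyacknowledged', 5)))
    # almost always recovers 5 on English-looking text of this length
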
@@ -125,22 +128,22 @@ def affine_break(message, fitness=Pletters):
                     '{3} and decrypt starting: {4}'.format(
                         best_multiplier, best_adder, best_one_based, best_fit,
                         affine_decipher(sanitised_message, best_multiplier,
-                            best_adder, best_one_based)[:50]))
+                                        best_adder, best_one_based)[:50]))
     return (best_multiplier, best_adder, best_one_based), best_fit
 
 def keyword_break(message, wordlist=keywords, fitness=Pletters):
-    """Breaks a keyword substitution cipher using a dictionary and 
-    frequency analysis
+    """Breaks a keyword substitution cipher using a dictionary and
+    frequency analysis.
 
     >>> keyword_break(keyword_encipher('this is a test message for the ' \
-       'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
+       'keyword decipherment', 'elephant', KeywordWrapAlphabet.from_last), \
        wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', <Keyword_wrap_alphabet.from_last: 2>), -52.834575011...)
+    (('elephant', <KeywordWrapAlphabet.from_last: 2>), -52.834575011...)
     """
     best_keyword = ''
     best_wrap_alphabet = True
     best_fit = float("-inf")
-    for wrap_alphabet in Keyword_wrap_alphabet:
+    for wrap_alphabet in KeywordWrapAlphabet:
         for keyword in wordlist:
             plaintext = keyword_decipher(message, keyword, wrap_alphabet)
             fit = fitness(plaintext)
@@ -159,19 +162,20 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters):
                         best_wrap_alphabet))[:50]))
     return (best_keyword, best_wrap_alphabet), best_fit
 
-def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500):
-    """Breaks a keyword substitution cipher using a dictionary and 
+def keyword_break_mp(message, wordlist=keywords, fitness=Pletters,
+                     chunksize=500):
+    """Breaks a keyword substitution cipher using a dictionary and
     frequency analysis
 
     >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
-       'keyword decipherment', 'elephant', Keyword_wrap_alphabet.from_last), \
+       'keyword decipherment', 'elephant', KeywordWrapAlphabet.from_last), \
        wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', <Keyword_wrap_alphabet.from_last: 2>), -52.834575011...)
+    (('elephant', <KeywordWrapAlphabet.from_last: 2>), -52.834575011...)
     """
     with Pool() as pool:
         helper_args = [(message, word, wrap, fitness)
                        for word in wordlist
-                       for wrap in Keyword_wrap_alphabet]
+                       for wrap in KeywordWrapAlphabet]
         # Gotcha: the helper function here needs to be defined at the top level
         #   (limitation of Pool.starmap)
         breaks = pool.starmap(keyword_break_worker, helper_args, chunksize)
@@ -185,14 +189,14 @@ def keyword_break_worker(message, keyword, wrap_alphabet, fitness):
                                 wrap_alphabet, fit, sanitise(plaintext)[:50]))
     return (keyword, wrap_alphabet), fit
 
-def monoalphabetic_break_hillclimbing(message, max_iterations = 10000000,
+def monoalphabetic_break_hillclimbing(message, max_iterations=10000000,
         fitness=Pletters):
     ciphertext = unaccent(message).lower()
     alphabet = list(string.ascii_lowercase)
     random.shuffle(alphabet)
     alphabet = ''.join(alphabet)
     return monoalphabetic_break_hillclimbing_worker(ciphertext, alphabet,
-        max_iterations, fitness)
+                                                    max_iterations, fitness)
 
 def monoalphabetic_break_hillclimbing_mp(message, workers=10,
         max_iterations = 10000000, fitness=Pletters, chunksize=1):
@@ -205,10 +209,10 @@ def monoalphabetic_break_hillclimbing_mp(message, workers=10,
         worker_args.append((ciphertext, alphabet, max_iterations, fitness))
     with Pool() as pool:
         breaks = pool.starmap(monoalphabetic_break_hillclimbing_worker,
-            worker_args, chunksize)
+                              worker_args, chunksize)
         return max(breaks, key=lambda k: k[1])
 
-def monoalphabetic_break_hillclimbing_worker(message, alphabet, 
+def monoalphabetic_break_hillclimbing_worker(message, alphabet,
         max_iterations, fitness):
     def swap(letters, i, j):
         if i > j:
@@ -216,8 +220,8 @@ def monoalphabetic_break_hillclimbing_worker(message, alphabet,
         if i == j:
             return letters
         else:
-            return letters[:i] + letters[j] + letters[i+1:j] +
-                letters[i] + letters[j+1:]
+            return (letters[:i] + letters[j] + letters[i+1:j] + letters[i] +
+                    letters[j+1:])
    best_alphabet = alphabet
    best_fitness = float('-inf')
    for i in range(max_iterations):
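Worth noting: the pre-patch return in swap ended a line with a bare +, which is a SyntaxError in Python, so this hunk is a bug fix as much as a lint fix. The parenthesised version behaves like this:

    def swap(letters, i, j):
        # exchange the characters at positions i and j of a string
        if i > j:
            i, j = j, i
        if i == j:
            return letters
        return (letters[:i] + letters[j] + letters[i+1:j] + letters[i]
                + letters[j+1:])

    assert swap('abcde', 1, 3) == 'adcbe'
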
@@ -231,17 +235,17 @@ def monoalphabetic_break_hillclimbing_worker(message, alphabet,
     return best_alphabet, best_fitness
 
 
-def column_transposition_break_mp(message, translist=transpositions,
-        fitness=Pbigrams, chunksize=500):
-    """Breaks a column transposition cipher using a dictionary and
+def column_transposition_break_mp(message, translist=transpositions,
+                                  fitness=Pbigrams, chunksize=500):
+    """Breaks a column transposition cipher using a dictionary and
     n-gram frequency analysis
 
     >>> column_transposition_break_mp(column_transposition_encipher(sanitise( \
         "It is a truth universally acknowledged, that a single man in \
         possession of a good fortune, must be in want of a wife. However \
         little known the feelings or views of such a man may be on his \
-        first entering a neighbourhood, this truth is so well fixed in the \
-        minds of the surrounding families, that he is considered the \
+        first entering a neighbourhood, this truth is so well fixed in \
+        the minds of the surrounding families, that he is considered the \
         rightful property of some one or other of their daughters."), \
         'encipher'), \
         translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
@@ -252,8 +256,8 @@ def column_transposition_break_mp(message, translist=transpositions,
         "It is a truth universally acknowledged, that a single man in \
         possession of a good fortune, must be in want of a wife. However \
         little known the feelings or views of such a man may be on his \
-        first entering a neighbourhood, this truth is so well fixed in the \
-        minds of the surrounding families, that he is considered the \
+        first entering a neighbourhood, this truth is so well fixed in \
+        the minds of the surrounding families, that he is considered the \
         rightful property of some one or other of their daughters."), \
         'encipher'), \
         translist={(2, 0, 5, 3, 1, 4, 6): ['encipher'], \
@@ -263,21 +267,21 @@ def column_transposition_break_mp(message, translist=transpositions,
         (5, 0, 6, 1, 3, 4, 2): ['fourteen'], \
         (6, 1, 0, 4, 5, 3, 2): ['keyword']}, \
         fitness=Ptrigrams) # doctest: +ELLIPSIS
     (((2, 0, 5, 3, 1, 4, 6), False, False), -997.0129085...)
     """
     with Pool() as pool:
-        helper_args = [(message, trans, fillcolumnwise, emptycolumnwise, 
-            fitness) 
-            for trans in translist.keys() 
+        helper_args = [(message, trans, fillcolumnwise, emptycolumnwise,
+                        fitness)
+                       for trans in translist.keys()
                        for fillcolumnwise in [True, False]
                        for emptycolumnwise in [True, False]]
-        # Gotcha: the helper function here needs to be defined at the top level 
+        # Gotcha: the helper function here needs to be defined at the top level
         #   (limitation of Pool.starmap)
-        breaks = pool.starmap(column_transposition_break_worker, 
-            helper_args, chunksize) 
+        breaks = pool.starmap(column_transposition_break_worker,
+                              helper_args, chunksize)
     return max(breaks, key=lambda k: k[1])
 column_transposition_break = column_transposition_break_mp
 
-def column_transposition_break_worker(message, transposition, 
+def column_transposition_break_worker(message, transposition,
         fillcolumnwise, emptycolumnwise, fitness):
-    plaintext = column_transposition_decipher(message, transposition, 
+    plaintext = column_transposition_decipher(message, transposition,
        fillcolumnwise=fillcolumnwise, emptycolumnwise=emptycolumnwise)
    fit = fitness(sanitise(plaintext))
    logger.debug('Column transposition break attempt using key {0} '
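The recurring 'Gotcha' comment deserves a demonstration: multiprocessing pickles each (function, args) task by the function's qualified name, so only module-level workers survive the trip to the pool. A minimal reproduction, with hypothetical function names:

    from multiprocessing import Pool

    def top_level(x, y):          # picklable: fine with Pool.starmap
        return x + y

    def demo():
        def nested(x, y):         # local object: starmap fails to pickle it
            return x + y
        with Pool(2) as pool:
            print(pool.starmap(top_level, [(1, 2), (3, 4)]))   # [3, 7]
            # pool.starmap(nested, [(1, 2)])  # AttributeError: Can't pickle local object

    if __name__ == '__main__':
        demo()
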
@@ -296,8 +300,8 @@ def scytale_break_mp(message, max_key_length=20,
         "It is a truth universally acknowledged, that a single man in \
         possession of a good fortune, must be in want of a wife. However \
         little known the feelings or views of such a man may be on his \
-        first entering a neighbourhood, this truth is so well fixed in the \
-        minds of the surrounding families, that he is considered the \
+        first entering a neighbourhood, this truth is so well fixed in \
+        the minds of the surrounding families, that he is considered the \
         rightful property of some one or other of their daughters."), \
         5)) # doctest: +ELLIPSIS
     (5, -709.4646722...)
@@ -305,31 +309,30 @@ def scytale_break_mp(message, max_key_length=20,
         "It is a truth universally acknowledged, that a single man in \
         possession of a good fortune, must be in want of a wife. However \
         little known the feelings or views of such a man may be on his \
-        first entering a neighbourhood, this truth is so well fixed in the \
-        minds of the surrounding families, that he is considered the \
+        first entering a neighbourhood, this truth is so well fixed in \
+        the minds of the surrounding families, that he is considered the \
         rightful property of some one or other of their daughters."), \
         5), \
         fitness=Ptrigrams) # doctest: +ELLIPSIS
     (5, -997.0129085...)
     """
     with Pool() as pool:
-        helper_args = [(message, trans, False, True, fitness) 
-            for trans in 
-                [[col for col in range(math.ceil(len(message)/rows))] 
+        helper_args = [(message, trans, False, True, fitness)
+                       for trans in
+                       [[col for col in range(math.ceil(len(message)/rows))]
                         for rows in range(1,max_key_length+1)]]
-        # Gotcha: the helper function here needs to be defined at the top level 
+        # Gotcha: the helper function here needs to be defined at the top level
         #   (limitation of Pool.starmap)
-        breaks = pool.starmap(column_transposition_break_worker, 
-            helper_args, chunksize) 
-        best = max(breaks, key=lambda k: k[1]) 
+        breaks = pool.starmap(column_transposition_break_worker,
+                              helper_args, chunksize)
+        best = max(breaks, key=lambda k: k[1])
     return math.trunc(len(message) / len(best[0][0])), best[1]
 scytale_break = scytale_break_mp
 
-def vigenere_keyword_break_mp(message, wordlist=keywords, fitness=Pletters, 
-        chunksize=500):
-    """Breaks a vigenere cipher using a dictionary and 
-    frequency analysis
+def vigenere_keyword_break_mp(message, wordlist=keywords, fitness=Pletters,
+                              chunksize=500):
+    """Breaks a vigenere cipher using a dictionary and frequency analysis.
 
     >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
         'message for the vigenere decipherment'), 'cat'), \
         wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
     ('cat', -52.947271216...)
     """
     with Pool() as pool:
-        helper_args = [(message, word, fitness) 
+        helper_args = [(message, word, fitness)
                        for word in wordlist]
-        # Gotcha: the helper function here needs to be defined at the top level 
+        # Gotcha: the helper function here needs to be defined at the top level
         #   (limitation of Pool.starmap)
-        breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize)
+        breaks = pool.starmap(vigenere_keyword_break_worker, helper_args,
+                              chunksize)
     return max(breaks, key=lambda k: k[1])
 vigenere_keyword_break = vigenere_keyword_break_mp
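scytale_break_mp can reuse column_transposition_break_worker because a scytale of r rows behaves like a column transposition with the identity key over ceil(len(message)/r) columns, so the breaker just tries one identity key per candidate row count. The trial keys it generates look like this (sketch with my own names):

    import math

    def scytale_trial_keys(message_len, max_key_length):
        # one identity transposition per assumed number of rows
        return [list(range(math.ceil(message_len / rows)))
                for rows in range(1, max_key_length + 1)]

    print(scytale_trial_keys(18, 3))  # [[0, ..., 17], [0, ..., 8], [0, ..., 5]]
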
""" with Pool() as pool: - helper_args = [(message, word, fitness) + helper_args = [(message, word, fitness) for word in wordlist] - # Gotcha: the helper function here needs to be defined at the top level + # Gotcha: the helper function here needs to be defined at the top level # (limitation of Pool.starmap) - breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, chunksize) + breaks = pool.starmap(vigenere_keyword_break_worker, helper_args, + chunksize) return max(breaks, key=lambda k: k[1]) vigenere_keyword_break = vigenere_keyword_break_mp @@ -349,7 +353,7 @@ def vigenere_keyword_break_worker(message, keyword, fitness): plaintext = vigenere_decipher(message, keyword) fit = fitness(plaintext) logger.debug('Vigenere keyword break attempt using key {0} gives fit of ' - '{1} and decrypt starting: {2}'.format(keyword, + '{1} and decrypt starting: {2}'.format(keyword, fit, sanitise(plaintext)[:50])) return keyword, fit @@ -374,8 +378,8 @@ def vigenere_frequency_break(message, max_key_length=20, fitness=Pletters): fit = fitness(plaintext) return key, fit sanitised_message = sanitise(message) - results = starmap(worker, [(sanitised_message, i, fitness) - for i in range(1, max_key_length+1)]) + results = starmap(worker, [(sanitised_message, i, fitness) + for i in range(1, max_key_length+1)]) return max(results, key=lambda k: k[1]) @@ -393,13 +397,14 @@ def beaufort_frequency_break(message, max_key_length=20, fitness=Pletters): """ def worker(message, key_length, fitness): splits = every_nth(sanitised_message, key_length) - key = ''.join([chr(-caesar_break(s)[0] % 26 + ord('a')) for s in splits]) + key = ''.join([chr(-caesar_break(s)[0] % 26 + ord('a')) + for s in splits]) plaintext = beaufort_decipher(message, key) fit = fitness(plaintext) return key, fit sanitised_message = sanitise(message) - results = starmap(worker, [(sanitised_message, i, fitness) - for i in range(1, max_key_length+1)]) + results = starmap(worker, [(sanitised_message, i, fitness) + for i in range(1, max_key_length+1)]) return max(results, key=lambda k: k[1]) diff --git a/language_models.py b/language_models.py index babbea1..62219ef 100644 --- a/language_models.py +++ b/language_models.py @@ -96,10 +96,10 @@ def ngrams(text, n): """Returns all n-grams of a text >>> ngrams(sanitise('the quick brown fox'), 2) # doctest: +NORMALIZE_WHITESPACE - ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn', + ['th', 'he', 'eq', 'qu', 'ui', 'ic', 'ck', 'kb', 'br', 'ro', 'ow', 'wn', 'nf', 'fo', 'ox'] >>> ngrams(sanitise('the quick brown fox'), 4) # doctest: +NORMALIZE_WHITESPACE - ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow', + ['theq', 'hequ', 'equi', 'quic', 'uick', 'ickb', 'ckbr', 'kbro', 'brow', 'rown', 'ownf', 'wnfo', 'nfox'] """ return [text[i:i+n] for i in range(len(text)-n+1)] @@ -129,30 +129,29 @@ Pl = Pdist(datafile('count_1l.txt'), lambda _k, _N: 0) P2l = Pdist(datafile('count_2l.txt'), lambda _k, _N: 0) P3l = Pdist(datafile('count_3l.txt'), lambda _k, _N: 0) -def Pwords(words): +def Pwords(words): """The Naive Bayes log probability of a sequence of words. """ return sum(Pw[w.lower()] for w in words) -def Pwords_wrong(words): +def Pwords_wrong(words): """The Naive Bayes log probability of a sequence of words. """ return sum(Pw_wrong[w.lower()] for w in words) - def Pletters(letters): """The Naive Bayes log probability of a sequence of letters. 
""" return sum(Pl[l.lower()] for l in letters) def Pbigrams(letters): - """The Naive Bayes log probability of the bigrams formed from a sequence + """The Naive Bayes log probability of the bigrams formed from a sequence of letters. """ return sum(P2l[p] for p in ngrams(letters, 2)) def Ptrigrams(letters): - """The Naive Bayes log probability of the trigrams formed from a sequence + """The Naive Bayes log probability of the trigrams formed from a sequence of letters. """ return sum(P3l[p] for p in ngrams(letters, 3)) @@ -165,8 +164,8 @@ def cosine_similarity_score(text): >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS 0.26228882... """ - return norms.cosine_similarity(english_counts, - collections.Counter(sanitise(text))) + return norms.cosine_similarity(english_counts, + collections.Counter(sanitise(text))) if __name__ == "__main__": diff --git a/norms.py b/norms.py index b8e4bf1..36e7fa4 100644 --- a/norms.py +++ b/norms.py @@ -1,9 +1,10 @@ +"""Define a variety of norms for finding distances between vectors""" + import collections -from math import log10 def normalise(frequencies): """Scale a set of frequencies so they sum to one - + >>> sorted(normalise({1: 1, 2: 0}).items()) [(1, 1.0), (2, 0.0)] >>> sorted(normalise({1: 1, 2: 1}).items()) @@ -14,12 +15,12 @@ def normalise(frequencies): [(1, 0.25), (2, 0.5), (3, 0.25)] """ length = sum([f for f in frequencies.values()]) - return collections.defaultdict(int, ((k, v / length) + return collections.defaultdict(int, ((k, v / length) for (k, v) in frequencies.items())) def euclidean_scale(frequencies): """Scale a set of frequencies so they have a unit euclidean length - + >>> sorted(euclidean_scale({1: 1, 2: 0}).items()) [(1, 1.0), (2, 0.0)] >>> sorted(euclidean_scale({1: 1, 2: 1}).items()) # doctest: +ELLIPSIS @@ -30,17 +31,21 @@ def euclidean_scale(frequencies): [(1, 0.408248...), (2, 0.81649658...), (3, 0.408248...)] """ length = sum([f ** 2 for f in frequencies.values()]) ** 0.5 - return collections.defaultdict(int, ((k, v / length) + return collections.defaultdict(int, ((k, v / length) for (k, v) in frequencies.items())) def identity_scale(frequencies): + """Don't scale a set of frequencies. (For use when a function expects a + scaling function but you don't want to supply one.) + """ return frequencies def l2(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as dictionaries. + """Finds the distances between two frequency profiles, expressed as + dictionaries. Assumes every key in frequencies1 is also in frequencies2 - + >>> l2({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) 0.0 >>> l2({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS @@ -62,7 +67,7 @@ def l2(frequencies1, frequencies2): euclidean_distance = l2 def l1(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as + """Finds the distances between two frequency profiles, expressed as dictionaries. Assumes every key in frequencies1 is also in frequencies2 >>> l1({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) @@ -82,7 +87,7 @@ def l1(frequencies1, frequencies2): return total def l3(frequencies1, frequencies2): - """Finds the distances between two frequency profiles, expressed as + """Finds the distances between two frequency profiles, expressed as dictionaries. 
@@ -82,7 +87,7 @@ def l1(frequencies1, frequencies2):
     return total
 
 def l3(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as 
+    """Finds the distances between two frequency profiles, expressed as
     dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
 
     >>> l3({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1})
@@ -105,10 +110,10 @@ def l3(frequencies1, frequencies2):
     return total ** (1/3)
 
 def geometric_mean(frequencies1, frequencies2):
-    """Finds the geometric mean of the absolute differences between two frequency profiles,
-    expressed as dictionaries.
+    """Finds the geometric mean of the absolute differences between two
+    frequency profiles, expressed as dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
-    
+
     >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
     1
     >>> geometric_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
@@ -131,8 +136,8 @@ def geometric_mean(frequencies1, frequencies2):
     return total
 
 def harmonic_mean(frequencies1, frequencies2):
-    """Finds the harmonic mean of the absolute differences between two frequency profiles,
-    expressed as dictionaries.
+    """Finds the harmonic mean of the absolute differences between two
+    frequency profiles, expressed as dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
 
     >>> harmonic_mean({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1})
@@ -160,7 +165,8 @@ def harmonic_mean(frequencies1, frequencies2):
 
 
 def cosine_similarity(frequencies1, frequencies2):
-    """Finds the distances between two frequency profiles, expressed as dictionaries.
+    """Finds the distances between two frequency profiles, expressed as
+    dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
 
     >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
diff --git a/segment.py b/segment.py
index 1af1b62..a64ea5d 100644
--- a/segment.py
+++ b/segment.py
@@ -1,3 +1,5 @@
+"""Segment a collection of letters into words"""
+
 import language_models
 import sys
 from functools import lru_cache
@@ -8,7 +10,7 @@ def segment(text):
     """Return a list of words that is the best segmentation of text.
     """
     if not text: return []
-    candidates = ([first]+segment(rest) for first,rest in splits(text))
+    candidates = ([first]+segment(rest) for first, rest in splits(text))
     return max(candidates, key=language_models.Pwords)
 
 @lru_cache()
@@ -16,13 +18,13 @@ def segment_wrong(text):
     """Return a list of words that is the best segmentation of text.
     """
     if not text: return []
-    candidates = ([first]+segment(rest) for first,rest in splits(text))
+    candidates = ([first]+segment(rest) for first, rest in splits(text))
     return max(candidates, key=language_models.Pwords_wrong)
 
 def splits(text, L=20):
     """Return a list of all possible (first, rest) pairs, len(first)<=L.
     """
-    return [(text[:i+1], text[i+1:]) 
+    return [(text[:i+1], text[i+1:])
             for i in range(min(len(text), L))]
-- 
2.34.1
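The recursion in segment() is the classic memoised word-segmentation search: try every prefix up to 20 letters, recurse on the remainder, and keep the split that scores best. A runnable miniature, with a toy scorer standing in for language_models.Pwords:

    from functools import lru_cache

    WORDS = {'the', 'quick', 'brown', 'fox'}

    def score(words):
        # toy stand-in for Pwords: reward known words, penalise junk
        return sum(1 if w in WORDS else -len(w) for w in words)

    @lru_cache()
    def segment_sketch(text):
        if not text:
            return []
        candidates = ([text[:i + 1]] + segment_sketch(text[i + 1:])
                      for i in range(min(len(text), 20)))
        return max(candidates, key=score)

    print(segment_sketch('thequickbrownfox'))  # ['the', 'quick', 'brown', 'fox']
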