From: Neil Smith <neil.git@njae.me.uk>
Date: Fri, 14 Mar 2014 12:12:50 +0000 (+0000)
Subject: Updated letter frequencies, updated test values to reflect them
X-Git-Url: https://git.njae.me.uk/?a=commitdiff_plain;h=96d46a680a808555a9ff77f2eaa68383569f07ee;p=cipher-training.git

Updated letter frequencies, updated test values to reflect them
---

diff --git a/cipherbreak.py b/cipherbreak.py
index d2c35c9..17df97a 100644
--- a/cipherbreak.py
+++ b/cipherbreak.py
@@ -50,10 +50,6 @@ def frequencies(text):
     >>> frequencies('abcdefabcdef')['x']
     0
     """
-    #counts = collections.defaultdict(int)
-    #for c in text: 
-    #    counts[c] += 1
-    #return counts
     return collections.Counter(c for c in text)
 
 
@@ -62,13 +58,13 @@ def caesar_break(message, fitness=Pletters):
     
     >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
           'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
-    (4, -130.849890899...)
+    (4, -130.849989015...)
     >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
           'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
-    (19, -128.82516920...)
+    (19, -128.82410410...)
     >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
           'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
-    (13, -126.25233502...)
+    (13, -126.25403935...)
     """
     sanitised_message = sanitise(message)
     best_shift = 0
@@ -95,7 +91,7 @@ def affine_break(message, fitness=Pletters):
           'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
           'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai ' \
           'kxd clm ckuxj.') # doctest: +ELLIPSIS
-    ((15, 22, True), -340.611412245...)
+    ((15, 22, True), -340.601181913...)
     """
     sanitised_message = sanitise(message)
     best_multiplier = 0
@@ -131,7 +127,7 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters):
     >>> keyword_break(keyword_encipher('this is a test message for the ' \
           'keyword decipherment', 'elephant', 1), \
           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.8345642265...)
+    (('elephant', 1), -52.834575011...)
     """
     best_keyword = ''
     best_wrap_alphabet = True
@@ -162,7 +158,7 @@ def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500
     >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
           'keyword decipherment', 'elephant', 1), \
           wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834564226507...)
+    (('elephant', 1), -52.834575011...)
     """
     with Pool() as pool:
         helper_args = [(message, word, wrap, fitness) 
@@ -287,7 +283,7 @@ def vigenere_keyword_break(message, wordlist=keywords, fitness=Pletters):
     >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
              'message for the vigenere decipherment'), 'cat'), \
              wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('cat', -52.9479167030...)
+    ('cat', -52.947271216...)
     """
     best_keyword = ''
     best_fit = float("-inf")
@@ -315,7 +311,7 @@ def vigenere_keyword_break_mp(message, wordlist=keywords, fitness=Pletters,
     >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
              'message for the vigenere decipherment'), 'cat'), \
              wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('cat', -52.9479167030...)
+    ('cat', -52.947271216...)
     """
     with Pool() as pool:
         helper_args = [(message, word, fitness) 
@@ -345,7 +341,7 @@ def vigenere_frequency_break(message, fitness=Pletters):
             "certain that the theft has been discovered and that I will " \
             "be caught. The SS officer visits less often now that he is " \
             "sure"), 'florence')) # doctest: +ELLIPSIS
-    ('florence', -307.5549865898...)
+    ('florence', -307.5473096791...)
     """
     best_fit = float("-inf")
     best_key = ''
@@ -376,7 +372,7 @@ def beaufort_frequency_break(message, fitness=Pletters):
             "certain that the theft has been discovered and that I will " \
             "be caught. The SS officer visits less often now " \
             "that he is sure"), 'florence')) # doctest: +ELLIPSIS
-    ('florence', -307.5549865898...)
+    ('florence', -307.5473096791...)
     """
     best_fit = float("-inf")
     best_key = ''
diff --git a/count_1l.txt b/count_1l.txt
index c1fc8ef..e9ac0c6 100644
--- a/count_1l.txt
+++ b/count_1l.txt
@@ -1,26 +1,26 @@
-e	756288
-t	559059
-o	503173
-a	489107
-i	420131
-n	418342
-h	415853
-s	403715
-r	372431
-d	267381
-l	258537
-u	189758
-m	171836
-w	153882
-y	142711
-c	140497
-f	134935
-g	117474
-p	100241
-b	92647
-v	65181
-k	54114
-x	7386
-j	6590
-q	5488
-z	3575
+e	758103
+t	560576
+o	504520
+a	490129
+i	421240
+n	419374
+h	416369
+s	404473
+r	373599
+d	267917
+l	259023
+u	190269
+m	172199
+w	154157
+y	143040
+c	141094
+f	135318
+g	117888
+p	100690
+b	92919
+v	65297
+k	54248
+x	7414
+j	6679
+q	5499
+z	3577
diff --git a/language_models.py b/language_models.py
index 9297468..ceb4596 100644
--- a/language_models.py
+++ b/language_models.py
@@ -140,14 +140,14 @@ def Pbigrams(letters):
     return sum(P2l[p] for p in ngrams(letters, 2))
 
 
-def cosine_distance_score(text):
+def cosine_similarity_score(text):
     """Finds the dissimilarity of a text to English, using the cosine distance
     of the frequency distribution.
 
-    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
-    0.370847405...
+    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
+    0.26228882...
     """
-    return norms.cosine_distance(english_counts, 
+    return norms.cosine_similarity(english_counts, 
         collections.Counter(sanitise(text)))
 
 
diff --git a/lettercount.py b/lettercount.py
index 18f75c4..956eca1 100644
--- a/lettercount.py
+++ b/lettercount.py
@@ -8,8 +8,6 @@ for corpus in corpora:
     text = sanitise(open(corpus).read())
     counts.update(text)
 
-sorted_letters = sorted(counts, key=counts.get, reverse=True)
-
 with open('count_1l.txt', 'w') as f:
-    for l in sorted_letters:
-        f.write("{}\t{}\n".format(l, counts[l]))
+    for l, c in counts.most_common():
+        f.write("{}\t{}\n".format(l, c))
diff --git a/norms.py b/norms.py
index 37fd3c9..b8e4bf1 100644
--- a/norms.py
+++ b/norms.py
@@ -163,13 +163,13 @@ def cosine_similarity(frequencies1, frequencies2):
     """Finds the distances between two frequency profiles, expressed as dictionaries.
     Assumes every key in frequencies1 is also in frequencies2
 
-    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
     1.0000000000...
-    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
     1.0000000000...
-    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
     0.5773502691...
-    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
     0.7071067811...
     """
     numerator = 0