Updated letter frequencies, updated test values to reflect them

author Neil Smith <neil.git@njae.me.uk>

Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)

committer Neil Smith <neil.git@njae.me.uk>

Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
author Neil Smith <neil.git@njae.me.uk>
Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
committer Neil Smith <neil.git@njae.me.uk>
Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
diff --git a/cipherbreak.py b/cipherbreak.py

index d2c35c93ed77b1c693dd68fd98bce7989b108066..17df97af0d7ee6769a8789ec1eee74a92a369ef8 100644 (file)
--- a/cipherbreak.py
+++ b/cipherbreak.py
@@ -50,10 +50,6 @@ def frequencies(text):
      >>> frequencies('abcdefabcdef')['x']
      0
      """
-    #counts = collections.defaultdict(int)
-    #for c in text: 
-    #    counts[c] += 1
-    #return counts
      return collections.Counter(c for c in text)
  
  
@@ -62,13 +58,13 @@ def caesar_break(message, fitness=Pletters):
      
      >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
            'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
-    (4, -130.849890899...)
+    (4, -130.849989015...)
      >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
            'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
-    (19, -128.82516920...)
+    (19, -128.82410410...)
      >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
            'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
-    (13, -126.25233502...)
+    (13, -126.25403935...)
      """
      sanitised_message = sanitise(message)
      best_shift = 0
@@ -95,7 +91,7 @@ def affine_break(message, fitness=Pletters):
            'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
            'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai ' \
            'kxd clm ckuxj.') # doctest: +ELLIPSIS
-    ((15, 22, True), -340.611412245...)
+    ((15, 22, True), -340.601181913...)
      """
      sanitised_message = sanitise(message)
      best_multiplier = 0
@@ -131,7 +127,7 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters):
      >>> keyword_break(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.8345642265...)
+    (('elephant', 1), -52.834575011...)
      """
      best_keyword = ''
      best_wrap_alphabet = True
@@ -162,7 +158,7 @@ def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500
      >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834564226507...)
+    (('elephant', 1), -52.834575011...)
      """
      with Pool() as pool:
          helper_args = [(message, word, wrap, fitness) 
@@ -287,7 +283,7 @@ def vigenere_keyword_break(message, wordlist=keywords, fitness=Pletters):
      >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
               'message for the vigenere decipherment'), 'cat'), \
               wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('cat', -52.9479167030...)
+    ('cat', -52.947271216...)
      """
      best_keyword = ''
      best_fit = float("-inf")
@@ -315,7 +311,7 @@ def vigenere_keyword_break_mp(message, wordlist=keywords, fitness=Pletters,
      >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
               'message for the vigenere decipherment'), 'cat'), \
               wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('cat', -52.9479167030...)
+    ('cat', -52.947271216...)
      """
      with Pool() as pool:
          helper_args = [(message, word, fitness) 
@@ -345,7 +341,7 @@ def vigenere_frequency_break(message, fitness=Pletters):
              "certain that the theft has been discovered and that I will " \
              "be caught. The SS officer visits less often now that he is " \
              "sure"), 'florence')) # doctest: +ELLIPSIS
-    ('florence', -307.5549865898...)
+    ('florence', -307.5473096791...)
      """
      best_fit = float("-inf")
      best_key = ''
@@ -376,7 +372,7 @@ def beaufort_frequency_break(message, fitness=Pletters):
              "certain that the theft has been discovered and that I will " \
              "be caught. The SS officer visits less often now " \
              "that he is sure"), 'florence')) # doctest: +ELLIPSIS
-    ('florence', -307.5549865898...)
+    ('florence', -307.5473096791...)
      """
      best_fit = float("-inf")
      best_key = ''
diff --git a/count_1l.txt b/count_1l.txt

index c1fc8efb1514c6b67cd0d66b1da928916a12ba45..e9ac0c6594c432e6a478fa453c6579459d88f26a 100644 (file)
--- a/count_1l.txt
+++ b/count_1l.txt
@@ -1,26 +1,26 @@
-e      756288
-t      559059
-o      503173
-a      489107
-i      420131
-n      418342
-h      415853
-s      403715
-r      372431
-d      267381
-l      258537
-u      189758
-m      171836
-w      153882
-y      142711
-c      140497
-f      134935
-g      117474
-p      100241
-b      92647
-v      65181
-k      54114
-x      7386
-j      6590
-q      5488
-z      3575
+e      758103
+t      560576
+o      504520
+a      490129
+i      421240
+n      419374
+h      416369
+s      404473
+r      373599
+d      267917
+l      259023
+u      190269
+m      172199
+w      154157
+y      143040
+c      141094
+f      135318
+g      117888
+p      100690
+b      92919
+v      65297
+k      54248
+x      7414
+j      6679
+q      5499
+z      3577
diff --git a/language_models.py b/language_models.py

index 929746888d036fb54de3f1fbf228e296e0bcd027..ceb4596eb2fd87d3d2375f338892f9652525f2d4 100644 (file)
--- a/language_models.py
+++ b/language_models.py
@@ -140,14 +140,14 @@ def Pbigrams(letters):
      return sum(P2l[p] for p in ngrams(letters, 2))
  
  
-def cosine_distance_score(text):
+def cosine_similarity_score(text):
      """Finds the similarity of a text to English, using the cosine similarity
      of the frequency distribution.
  
-    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
-    0.370847405...
+    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
+    0.26228882...
      """
-    return norms.cosine_distance(english_counts, 
+    return norms.cosine_similarity(english_counts, 
          collections.Counter(sanitise(text)))
  
  
diff --git a/lettercount.py b/lettercount.py

index 18f75c48db4d8f141f19a645267a86f58b704d5e..956eca1a5b2ba469d66c80612446726cdf8307d4 100644 (file)
--- a/lettercount.py
+++ b/lettercount.py
@@ -8,8 +8,6 @@ for corpus in corpora:
      text = sanitise(open(corpus).read())
      counts.update(text)
  
-sorted_letters = sorted(counts, key=counts.get, reverse=True)
-
  with open('count_1l.txt', 'w') as f:
-    for l in sorted_letters:
-        f.write("{}\t{}\n".format(l, counts[l]))
+    for l, c in counts.most_common():
+        f.write("{}\t{}\n".format(l, c))
diff --git a/norms.py b/norms.py

index 37fd3c93329aa018b31fdf7f9a944eb496c41d44..b8e4bf1ef82c8b8ea6f85c0e3c3597d90b7d9b8b 100644 (file)
--- a/norms.py
+++ b/norms.py
@@ -163,13 +163,13 @@ def cosine_similarity(frequencies1, frequencies2):
      """Finds the cosine similarity between two frequency profiles, expressed as dictionaries.
      Assumes every key in frequencies1 is also in frequencies2
  
-    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
      1.0000000000...
-    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
      1.0000000000...
-    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
      0.5773502691...
-    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
      0.7071067811...
      """
      numerator = 0
author	Neil Smith <neil.git@njae.me.uk>
	Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
committer	Neil Smith <neil.git@njae.me.uk>
	Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
cipherbreak.py		patch \| blob \| history
count_1l.txt		patch \| blob \| history
language_models.py		patch \| blob \| history
lettercount.py		patch \| blob \| history
norms.py		patch \| blob \| history