Updated letter frequencies, updated test values to reflect them

author Neil Smith <neil.git@njae.me.uk>
Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
committer Neil Smith <neil.git@njae.me.uk>
Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
diff --git a/cipherbreak.py b/cipherbreak.py

index d2c35c93ed77b1c693dd68fd98bce7989b108066..17df97af0d7ee6769a8789ec1eee74a92a369ef8 100644 (file)
--- a/cipherbreak.py
+++ b/cipherbreak.py
@@ -50,10 +50,6 @@ def frequencies(text):
      >>> frequencies('abcdefabcdef')['x']
      0
      """
      >>> frequencies('abcdefabcdef')['x']
      0
      """
-    #counts = collections.defaultdict(int)
-    #for c in text: 
-    #    counts[c] += 1
-    #return counts
      return collections.Counter(c for c in text)
  
  
      return collections.Counter(c for c in text)
  
  
@@ -62,13 +58,13 @@ def caesar_break(message, fitness=Pletters):
      
      >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
            'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
      
      >>> caesar_break('ibxcsyorsaqcheyklxivoexlevmrimwxsfiqevvmihrsasrxliwyrh' \
            'ecjsppsamrkwleppfmergefifvmhixscsymjcsyqeoixlm') # doctest: +ELLIPSIS
-    (4, -130.849890899...)
+    (4, -130.849989015...)
      >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
            'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
      >>> caesar_break('wxwmaxdgheetgwuxztgptedbgznitgwwhpguxyhkxbmhvvtlbhgtee' \
            'raxlmhiixweblmxgxwmhmaxybkbgztgwztsxwbgmxgmert') # doctest: +ELLIPSIS
-    (19, -128.82516920...)
+    (19, -128.82410410...)
      >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
            'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
      >>> caesar_break('yltbbqnqnzvguvaxurorgenafsbezqvagbnornfgsbevpnaabjurer' \
            'svaquvzyvxrnznazlybequrvfohgriraabjtbaruraprur') # doctest: +ELLIPSIS
-    (13, -126.25233502...)
+    (13, -126.25403935...)
      """
      sanitised_message = sanitise(message)
      best_shift = 0
      """
      sanitised_message = sanitise(message)
      best_shift = 0
@@ -95,7 +91,7 @@ def affine_break(message, fitness=Pletters):
            'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
            'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai ' \
            'kxd clm ckuxj.') # doctest: +ELLIPSIS
            'ls umfjsd jlsi zg hfsqysxog. ls dmmdtsd mx jls bats mh bkbsf. ls ' \
            'bfmctsd kfmyxd jls lyj, mztanamyu xmc jm clm cku tmmeaxw kj lai ' \
            'kxd clm ckuxj.') # doctest: +ELLIPSIS
-    ((15, 22, True), -340.611412245...)
+    ((15, 22, True), -340.601181913...)
      """
      sanitised_message = sanitise(message)
      best_multiplier = 0
      """
      sanitised_message = sanitise(message)
      best_multiplier = 0
@@ -131,7 +127,7 @@ def keyword_break(message, wordlist=keywords, fitness=Pletters):
      >>> keyword_break(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
      >>> keyword_break(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.8345642265...)
+    (('elephant', 1), -52.834575011...)
      """
      best_keyword = ''
      best_wrap_alphabet = True
      """
      best_keyword = ''
      best_wrap_alphabet = True
@@ -162,7 +158,7 @@ def keyword_break_mp(message, wordlist=keywords, fitness=Pletters, chunksize=500
      >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
      >>> keyword_break_mp(keyword_encipher('this is a test message for the ' \
            'keyword decipherment', 'elephant', 1), \
            wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    (('elephant', 1), -52.834564226507...)
+    (('elephant', 1), -52.834575011...)
      """
      with Pool() as pool:
          helper_args = [(message, word, wrap, fitness) 
      """
      with Pool() as pool:
          helper_args = [(message, word, wrap, fitness) 
@@ -287,7 +283,7 @@ def vigenere_keyword_break(message, wordlist=keywords, fitness=Pletters):
      >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
               'message for the vigenere decipherment'), 'cat'), \
               wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
      >>> vigenere_keyword_break(vigenere_encipher(sanitise('this is a test ' \
               'message for the vigenere decipherment'), 'cat'), \
               wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('cat', -52.9479167030...)
+    ('cat', -52.947271216...)
      """
      best_keyword = ''
      best_fit = float("-inf")
      """
      best_keyword = ''
      best_fit = float("-inf")
@@ -315,7 +311,7 @@ def vigenere_keyword_break_mp(message, wordlist=keywords, fitness=Pletters,
      >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
               'message for the vigenere decipherment'), 'cat'), \
               wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
      >>> vigenere_keyword_break_mp(vigenere_encipher(sanitise('this is a test ' \
               'message for the vigenere decipherment'), 'cat'), \
               wordlist=['cat', 'elephant', 'kangaroo']) # doctest: +ELLIPSIS
-    ('cat', -52.9479167030...)
+    ('cat', -52.947271216...)
      """
      with Pool() as pool:
          helper_args = [(message, word, fitness) 
      """
      with Pool() as pool:
          helper_args = [(message, word, fitness) 
@@ -345,7 +341,7 @@ def vigenere_frequency_break(message, fitness=Pletters):
              "certain that the theft has been discovered and that I will " \
              "be caught. The SS officer visits less often now that he is " \
              "sure"), 'florence')) # doctest: +ELLIPSIS
              "certain that the theft has been discovered and that I will " \
              "be caught. The SS officer visits less often now that he is " \
              "sure"), 'florence')) # doctest: +ELLIPSIS
-    ('florence', -307.5549865898...)
+    ('florence', -307.5473096791...)
      """
      best_fit = float("-inf")
      best_key = ''
      """
      best_fit = float("-inf")
      best_key = ''
@@ -376,7 +372,7 @@ def beaufort_frequency_break(message, fitness=Pletters):
              "certain that the theft has been discovered and that I will " \
              "be caught. The SS officer visits less often now " \
              "that he is sure"), 'florence')) # doctest: +ELLIPSIS
              "certain that the theft has been discovered and that I will " \
              "be caught. The SS officer visits less often now " \
              "that he is sure"), 'florence')) # doctest: +ELLIPSIS
-    ('florence', -307.5549865898...)
+    ('florence', -307.5473096791...)
      """
      best_fit = float("-inf")
      best_key = ''
      """
      best_fit = float("-inf")
      best_key = ''
diff --git a/count_1l.txt b/count_1l.txt

index c1fc8efb1514c6b67cd0d66b1da928916a12ba45..e9ac0c6594c432e6a478fa453c6579459d88f26a 100644 (file)
--- a/count_1l.txt
+++ b/count_1l.txt
@@ -1,26 +1,26 @@
-e      756288
-t      559059
-o      503173
-a      489107
-i      420131
-n      418342
-h      415853
-s      403715
-r      372431
-d      267381
-l      258537
-u      189758
-m      171836
-w      153882
-y      142711
-c      140497
-f      134935
-g      117474
-p      100241
-b      92647
-v      65181
-k      54114
-x      7386
-j      6590
-q      5488
-z      3575
+e      758103
+t      560576
+o      504520
+a      490129
+i      421240
+n      419374
+h      416369
+s      404473
+r      373599
+d      267917
+l      259023
+u      190269
+m      172199
+w      154157
+y      143040
+c      141094
+f      135318
+g      117888
+p      100690
+b      92919
+v      65297
+k      54248
+x      7414
+j      6679
+q      5499
+z      3577
diff --git a/language_models.py b/language_models.py

index 929746888d036fb54de3f1fbf228e296e0bcd027..ceb4596eb2fd87d3d2375f338892f9652525f2d4 100644 (file)
--- a/language_models.py
+++ b/language_models.py
@@ -140,14 +140,14 @@ def Pbigrams(letters):
      return sum(P2l[p] for p in ngrams(letters, 2))
  
  
      return sum(P2l[p] for p in ngrams(letters, 2))
  
  
-def cosine_distance_score(text):
+def cosine_similarity_score(text):
      """Finds the dissimilarity of a text to English, using the cosine distance
      of the frequency distribution.
  
      """Finds the dissimilarity of a text to English, using the cosine distance
      of the frequency distribution.
  
-    >>> cosine_distance_score('abcabc') # doctest: +ELLIPSIS
-    0.370847405...
+    >>> cosine_similarity_score('abcabc') # doctest: +ELLIPSIS
+    0.26228882...
      """
      """
-    return norms.cosine_distance(english_counts, 
+    return norms.cosine_similarity(english_counts, 
          collections.Counter(sanitise(text)))
  
  
          collections.Counter(sanitise(text)))
  
  
diff --git a/lettercount.py b/lettercount.py

index 18f75c48db4d8f141f19a645267a86f58b704d5e..956eca1a5b2ba469d66c80612446726cdf8307d4 100644 (file)
--- a/lettercount.py
+++ b/lettercount.py
@@ -8,8 +8,6 @@ for corpus in corpora:
      text = sanitise(open(corpus).read())
      counts.update(text)
  
      text = sanitise(open(corpus).read())
      counts.update(text)
  
-sorted_letters = sorted(counts, key=counts.get, reverse=True)
-
  with open('count_1l.txt', 'w') as f:
  with open('count_1l.txt', 'w') as f:
-    for l in sorted_letters:
-        f.write("{}\t{}\n".format(l, counts[l]))
+    for l, c in counts.most_common():
+        f.write("{}\t{}\n".format(l, c))
diff --git a/norms.py b/norms.py

index 37fd3c93329aa018b31fdf7f9a944eb496c41d44..b8e4bf1ef82c8b8ea6f85c0e3c3597d90b7d9b8b 100644 (file)
--- a/norms.py
+++ b/norms.py
@@ -163,13 +163,13 @@ def cosine_similarity(frequencies1, frequencies2):
      """Finds the distances between two frequency profiles, expressed as dictionaries.
      Assumes every key in frequencies1 is also in frequencies2
  
      """Finds the distances between two frequency profiles, expressed as dictionaries.
      Assumes every key in frequencies1 is also in frequencies2
  
-    >>> cosine_distance({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':1, 'b':1, 'c':1}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
      1.0000000000...
      1.0000000000...
-    >>> cosine_distance({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':2, 'b':2, 'c':2}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
      1.0000000000...
      1.0000000000...
-    >>> cosine_distance({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':0, 'b':2, 'c':0}, {'a':1, 'b':1, 'c':1}) # doctest: +ELLIPSIS
      0.5773502691...
      0.5773502691...
-    >>> cosine_distance({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
+    >>> cosine_similarity({'a':0, 'b':1}, {'a':1, 'b':1}) # doctest: +ELLIPSIS
      0.7071067811...
      """
      numerator = 0
      0.7071067811...
      """
      numerator = 0
author	Neil Smith <neil.git@njae.me.uk>
	Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
committer	Neil Smith <neil.git@njae.me.uk>
	Fri, 14 Mar 2014 12:12:50 +0000 (12:12 +0000)
cipherbreak.py		patch | blob | history
count_1l.txt		patch | blob | history
language_models.py		patch | blob | history
lettercount.py		patch | blob | history
norms.py		patch | blob | history