X-Git-Url: https://git.njae.me.uk/?a=blobdiff_plain;f=lettercount.py;h=956eca1a5b2ba469d66c80612446726cdf8307d4;hb=317066e6551d143e38d093a55c4645dbd53c1c57;hp=4a7082d1068669762d1c8526c761382d07ed6182;hpb=3e8d2bd8cd7c623116fa3d2b77db954f51b191e4;p=cipher-training.git

diff --git a/lettercount.py b/lettercount.py
index 4a7082d..956eca1 100644
--- a/lettercount.py
+++ b/lettercount.py
@@ -1,21 +1,17 @@
+from language_models import sanitise
 import collections
-import string
-
-def sanitise(text):
-    return [l.lower() for l in text if l in string.ascii_letters]
 
 corpora = ['shakespeare.txt', 'sherlock-holmes.txt', 'war-and-peace.txt']
-counts = collections.defaultdict(int)
+# Running tally, over every corpus, of the items sanitise() returns.
+counts = collections.Counter()
 
 for corpus in corpora:
-    text = sanitise(open(corpus, 'r').read())
-    for letter in text:
-        counts[letter] += 1
-
-sorted_letters = sorted(counts, key=counts.get, reverse=True)
+    # Close each corpus as soon as it is read rather than leaking the handle.
+    with open(corpus) as corpus_file:
+        text = sanitise(corpus_file.read())
+    counts.update(text)
 
+# Write one "item<TAB>count" line per entry, most frequent first.
 with open('count_1l.txt', 'w') as f:
-    for l in sorted_letters:
-        f.write("{0}\t{1}\n".format(l, counts[l]))
-        
-    
\ No newline at end of file
+    for l, c in counts.most_common():
+        f.write("{}\t{}\n".format(l, c))