OMerge branch 'development' of git.njae.me.uk:cipher-training into development
[cipher-training.git] / lettercount.py
"""Count single-character frequencies across the training corpora.

Each corpus file is read in full, passed through ``sanitise``, and every
character of the result is tallied in one shared counter.  The totals are
written to ``count_1l.txt`` as ``<letter>\\t<count>`` lines, most frequent
first.
"""
from language_models import sanitise
import collections

corpora = ['shakespeare.txt', 'sherlock-holmes.txt', 'war-and-peace.txt']
counts = collections.Counter()

for corpus in corpora:
    # Read inside a context manager so the corpus file handle is closed as
    # soon as its text is in memory, rather than left to the garbage
    # collector.
    with open(corpus) as corpus_file:
        # NOTE(review): assumes sanitise returns a string, so that
        # Counter.update tallies individual characters -- confirm in
        # language_models.
        text = sanitise(corpus_file.read())
    counts.update(text)

with open('count_1l.txt', 'w') as f:
    # most_common() with no argument yields every (item, count) pair in
    # descending count order.
    for letter, count in counts.most_common():
        f.write("{}\t{}\n".format(letter, count))