# Single-letter frequency table, read from 'count_1l.txt'.
# Each line of the file holds two tab-separated fields: the letter and its
# integer count. int() tolerates the trailing newline left on the count.
english_counts = collections.defaultdict(int)
with open('count_1l.txt', 'r') as f:
    for line in f:
        (letter, count) = line.split("\t")
        english_counts[letter] = int(count)
# Normalised form of the table, as produced by norms.normalise
# (presumably scaled to a fixed magnitude -- see the norms module).
normalised_english_counts = norms.normalise(english_counts)
# Bigram (letter-pair) frequency table, read from 'count_2l.txt'.
# Each line of the file holds two tab-separated fields: the bigram and its
# integer count. int() tolerates the trailing newline left on the count.
english_bigram_counts = collections.defaultdict(int)
with open('count_2l.txt', 'r') as f:
    for line in f:
        (bigram, count) = line.split("\t")
        english_bigram_counts[bigram] = int(count)
# Normalised form of the table, as produced by norms.normalise
# (presumably scaled to a fixed magnitude -- see the norms module).
normalised_english_bigram_counts = norms.normalise(english_bigram_counts)
# Trigram (letter-triple) frequency table, read from 'count_3l.txt'.
# Each line of the file holds two tab-separated fields: the trigram and its
# integer count. int() tolerates the trailing newline left on the count.
english_trigram_counts = collections.defaultdict(int)
with open('count_3l.txt', 'r') as f:
    for line in f:
        (trigram, count) = line.split("\t")
        english_trigram_counts[trigram] = int(count)
# Normalised form of the table, as produced by norms.normalise
# (presumably scaled to a fixed magnitude -- see the norms module).
normalised_english_trigram_counts = norms.normalise(english_trigram_counts)
# Candidate keywords: one entry per line of 'words.txt', with trailing
# whitespace (including the newline) stripped.
with open('words.txt', 'r') as f:
    keywords = [line.rstrip() for line in f]
def weighted_choice(d):
    """Pick one random key from a dictionary of item counts.

    Each key is chosen with probability proportional to its value.

    d -- mapping of item -> non-negative numeric weight (count).

    Returns the chosen key, or None when the dictionary is empty or holds
    no positive weight.
    """
    target = random.uniform(0, sum(d.values()))
    # NOTE(review): the loop body was missing from the extracted listing and
    # has been reconstructed as a cumulative-weight walk -- confirm against
    # the original implementation.
    cumulative = 0.0
    fallback = None
    for (l, p) in d.items():
        cumulative += p
        if cumulative > target:
            return l
        if p > 0:
            fallback = l
    # random.uniform may return its upper endpoint, in which case no running
    # total strictly exceeds the target; fall back to the last weighted item.
    return fallback
def random_english_letter():
    """Generate a random letter based on English letter counts.

    Delegates to weighted_choice over normalised_english_counts, so each
    letter is drawn according to its weight in that table.
    """
    return weighted_choice(normalised_english_counts)
def letters(text):
    """Remove all non-alphabetic characters from a text.

    Only the ASCII letters a-z and A-Z are kept; case is preserved.

    >>> letters('The Quick')
    'TheQuick'
    >>> letters('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'TheQuickBROWNfoxjumpedoverthelazyDOG'
    """
    return ''.join([c for c in text if c in string.ascii_letters])
def unaccent(text):
    """Remove all accents from letters.

    It does this by converting the unicode string to decomposed
    compatibility form (NFKD), encoding to ASCII while dropping everything
    that is not ASCII (which discards the combining accents), then decoding
    the bytes back to a string.

    Characters with no ASCII decomposition are dropped entirely.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('héllo')
    'hello'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # NOTE(review): the final decode step was missing from the extracted
    # listing; the bytes are pure ASCII at this point, so 'ascii' and
    # 'utf-8' decode identically.
    return decomposed.encode('ascii', 'ignore').decode('ascii')
def sanitise(text):
    """Remove all non-alphabetic characters and convert the text to lowercase.

    Accents are stripped first (via unaccent), so accented letters are kept
    as their unaccented ASCII form rather than being discarded.

    >>> sanitise('The Quick')
    'thequick'
    >>> sanitise('The Quick BROWN fox jumped! over... the (9lazy) DOG')
    'thequickbrownfoxjumpedoverthelazydog'
    """
    return letters(unaccent(text)).lower()
92 if __name__
== "__main__":