From 2f33e16ccc84ddb0023f3621dd6ad545c1bb3251 Mon Sep 17 00:00:00 2001
From: Neil Smith
Date: Wed, 12 Feb 2014 17:03:41 +0000
Subject: [PATCH] Fiddling with cracking dictionaries

Add a script that crawls the MediaWiki 'allpages' API for the titles of
Wikipedia articles of at least 5000 bytes, and replace the word-sanitising
loop in make-cracking-dictionary.py with a one-line set construction.

---
 find_wikipedia_titles.py    | 39 +++++++++++++++++++++++++++++++++++++++
 make-cracking-dictionary.py |  8 +++++---
 2 files changed, 44 insertions(+), 3 deletions(-)
 create mode 100644 find_wikipedia_titles.py

diff --git a/find_wikipedia_titles.py b/find_wikipedia_titles.py
new file mode 100644
index 0000000..8d56124
--- /dev/null
+++ b/find_wikipedia_titles.py
@@ -0,0 +1,39 @@
+import urllib.request
+import urllib.parse
+import json
+import time
+
+# MediaWiki 'allpages' query: article titles in batches of 10,
+# restricted to pages of at least 5000 bytes.
+initial_request_url = "http://en.wikipedia.org/w/api.php?action=query&list=allpages&format=json&aplimit=10&apminsize=5000"
+request_url = "http://en.wikipedia.org/w/api.php?action=query&list=allpages&format=json&aplimit=10&apminsize=5000&apcontinue={}"
+titles_file = '/opt/sources/wp-titles.txt'
+
+def titles_of(result):
+    return [p['title'] for p in result['query']['allpages']]
+
+def next_title(result):
+    # The 'query-continue' block is absent from the final batch, so use
+    # get() rather than indexing; None signals that the crawl is done.
+    return result.get('query-continue', {}).get('allpages', {}).get('apcontinue')
+
+def write_titles(titles):
+    with open(titles_file, 'a') as f:
+        print('\n'.join(titles), file=f)
+
+def request_again(start_title):
+    request = urllib.request.Request(request_url.format(urllib.parse.quote(start_title)))
+    request.add_header('User-Agent', 'neil.wpspider@njae.me.uk')
+    result = json.loads(urllib.request.urlopen(request).read().decode())
+    return titles_of(result), next_title(result)
+
+# Truncate any titles file left over from a previous run.
+open(titles_file, 'w').close()
+
+result = json.loads(urllib.request.urlopen(initial_request_url).read().decode())
+write_titles(titles_of(result))
+n_title = next_title(result)
+while n_title:
+    time.sleep(0.5)
+    titles, n_title = request_again(n_title)
+    write_titles(titles)
diff --git a/make-cracking-dictionary.py b/make-cracking-dictionary.py
index f2ba6cb..37de917 100644
--- a/make-cracking-dictionary.py
+++ b/make-cracking-dictionary.py
@@ -6,11 +6,13 @@ cracklib = set(open('/usr/share/dict/cracklib-small', 'r').readlines())
 
 words = american | british | cracklib
 
-sanitised_words = set()
+# sanitised_words = set()
 
-for w in words:
-    sanitised_words.add(language_models.sanitise(w))
+# for w in words:
+    # sanitised_words.add(language_models.sanitise(w))
 
+sanitised_words = set(language_models.sanitise(w) for w in words)
+
 sanitised_words.discard('')
 
 with open('words.txt', 'w') as f:
-- 
2.34.1
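
A note on the API format find_wikipedia_titles.py relies on: continuation
data arrives in a top-level 'query-continue' block, the scheme MediaWiki
used at the time of this commit (later superseded by the 'continue' style).
A minimal sketch of the response shape the script expects; the page ids and
titles here are invented for illustration:

    # Illustrative shape of one parsed 'allpages' batch; values are made up.
    result = {
        'query': {
            'allpages': [
                {'pageid': 1, 'ns': 0, 'title': 'Aachen'},
                {'pageid': 2, 'ns': 0, 'title': 'Aardvark'},
            ]
        },
        # Absent from the final batch, hence the get() chain in next_title().
        'query-continue': {'allpages': {'apcontinue': 'Abacus'}},
    }

    titles = [p['title'] for p in result['query']['allpages']]
    n_title = result.get('query-continue', {}).get('allpages', {}).get('apcontinue')
    assert titles == ['Aachen', 'Aardvark'] and n_title == 'Abacus'

Feeding n_title back into the next request's apcontinue parameter is what
threads the crawl from one batch of titles to the next.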
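
make-cracking-dictionary.py leans on language_models.sanitise, which this
patch does not show. Assuming it reduces text to lowercase letters only (the
stand-in below is hypothetical, not the real function), the rewritten line
and the trailing discard('') work like this:

    # Hypothetical stand-in for language_models.sanitise:
    # keep only letters, lowercased.
    def sanitise(text):
        return ''.join(c for c in text.lower() if c.isalpha())

    # readlines() keeps trailing newlines, which sanitise() strips.
    words = {'Aachen\n', 'aardvark\n', '1066\n'}
    sanitised_words = set(sanitise(w) for w in words)
    sanitised_words.discard('')   # '1066' sanitises to '', so drop it
    assert sanitised_words == {'aachen', 'aardvark'}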