Fiddling with cracking dictionaries
author Neil Smith <neil.git@njae.me.uk>
Wed, 12 Feb 2014 17:03:41 +0000 (17:03 +0000)
committer Neil Smith <neil.git@njae.me.uk>
Wed, 12 Feb 2014 17:03:41 +0000 (17:03 +0000)
find_wikipedia_titles.py [new file with mode: 0644]
make-cracking-dictionary.py

diff --git a/find_wikipedia_titles.py b/find_wikipedia_titles.py
new file mode 100644 (file)
index 0000000..8d56124
--- /dev/null
+++ b/find_wikipedia_titles.py
@@ -0,0 +1,46 @@
+import urllib.request
+import urllib.parse
+import json
+import time
+
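+# Wikipedia API query: list all pages of at least 5000 bytes, ten titles at a time.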
+initial_request_url = "http://en.wikipedia.org/w/api.php?action=query&list=allpages&format=json&aplimit=10&apminsize=5000"
+request_url = "http://en.wikipedia.org/w/api.php?action=query&list=allpages&format=json&aplimit=10&apminsize=5000&apcontinue={}"
+titles_file = '/opt/sources/wp-titles.txt'
+
+def titles_of(result):
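+    # Pull the page titles out of one batch of API results.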
+    return [p['title'] for p in result['query']['allpages']]
+
+def next_title(result):
+    # The API includes a 'query-continue' block while more results remain;
+    # return None once the last batch has been fetched, instead of raising KeyError.
+    if 'query-continue' in result:
+        return result['query-continue']['allpages']['apcontinue']
+    return None
+
+def write_titles(titles):
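+    # Append this batch of titles to the output file.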
+    with open(titles_file, 'a') as f:
+        print('\n'.join(titles), file=f)
+
+def request_again(start_title):
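+    # Fetch the next batch of titles, continuing from start_title.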
+    request = urllib.request.Request(request_url.format(urllib.parse.quote(start_title)))
+    request.add_header('User-Agent','neil.wpspider@njae.me.uk')
+    result = json.loads(urllib.request.urlopen(request).read().decode())
+    return titles_of(result), next_title(result)
+
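+# Truncate the output file before appending batches to it.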
+open(titles_file, 'w').close()
+
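+# Fetch the first batch, then follow the continuation marker until done.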
+result = json.loads(urllib.request.urlopen(initial_request_url).read().decode())
+n_title = next_title(result)
+titles = titles_of(result)
+while titles:
+    write_titles(titles)
+    if n_title is None:
+        break
+    time.sleep(0.5)   # Pause between requests to be kind to the API.
+    titles, n_title = request_again(n_title)
diff --git a/make-cracking-dictionary.py b/make-cracking-dictionary.py
index f2ba6cbf8c02500fe276510cead64a0d2ca662dd..37de917608fb63b9f730fcf04cfc8c86035e7e58 100644 (file)
--- a/make-cracking-dictionary.py
+++ b/make-cracking-dictionary.py
@@ -6,11 +6,9 @@ cracklib = set(open('/usr/share/dict/cracklib-small', 'r').readlines())
 
 words = american | british | cracklib
 
-sanitised_words = set()
-
-for w in words:
-    sanitised_words.add(language_models.sanitise(w))
+# Build the set of sanitised words in one pass.
+sanitised_words = set(language_models.sanitise(w) for w in words)
 
 sanitised_words.discard('')
 
 with open('words.txt', 'w') as f: