# Query URL for the first request: list=allpages, JSON output, with the
# aplimit/apminsize values given in the query string.
initial_request_url = (
    "http://en.wikipedia.org/w/api.php"
    "?action=query&list=allpages&format=json&aplimit=10&apminsize=5000"
)

# Same query plus an `apcontinue` parameter; the `{}` placeholder is filled
# in with str.format() by the code that issues follow-up requests.
request_url = initial_request_url + "&apcontinue={}"

# File the collected titles are written to.
titles_file = '/opt/sources/wp-titles.txt'
def titles_of(result):
    """Return the page titles contained in a decoded API response.

    Args:
        result: Mapping with a ``result['query']['allpages']`` iterable
            whose entries each have a ``'title'`` key.

    Returns:
        A list of the ``'title'`` values, in the order they appear.

    Raises:
        KeyError: If ``'query'``, ``'allpages'`` or an entry's ``'title'``
            is missing.
    """
    collected = []
    for page in result['query']['allpages']:
        collected.append(page['title'])
    return collected
def next_title(result):
    """Return the ``apcontinue`` value at which the next request should start.

    Two response layouts are understood:

    * legacy: ``result['query-continue']['allpages']['apcontinue']``
    * current: ``result['continue']['apcontinue']``

    The legacy layout is tried first so that any response which worked
    before yields exactly the same value.

    Args:
        result: Decoded JSON response (a mapping).

    Returns:
        The continuation title string.

    Raises:
        KeyError: If the response carries no continuation marker in either
            layout (for example, on the last page of results).
    """
    try:
        return result['query-continue']['allpages']['apcontinue']
    except KeyError:
        # The MediaWiki API now returns a top-level 'continue' object unless
        # raw continuation is requested explicitly; fall back to it.
        return result['continue']['apcontinue']
def write_titles(titles):
    """Append a batch of titles to ``titles_file``, one title per line.

    Args:
        titles: Sequence of title strings.

    Returns:
        None.

    An empty batch is skipped: joining nothing and printing it would
    otherwise append a blank line to the file.
    """
    if not titles:
        return
    # Encoding is pinned to UTF-8 so that non-ASCII titles are written the
    # same way regardless of the platform's default locale encoding.
    with open(titles_file, 'a', encoding='utf-8') as f:
        print('\n'.join(titles), file=f)
def request_again(start_title):
    """Fetch one more batch of results, starting at ``start_title``.

    The title is URL-quoted and substituted into ``request_url``; the
    request is sent with the script's User-Agent header, and the response
    body is decoded and parsed as JSON.

    Args:
        start_title: Continuation title, as returned by ``next_title``.

    Returns:
        A ``(titles, next_start)`` tuple: the titles in this batch
        (``titles_of``) and the continuation value for the following
        request (``next_title``).

    Raises:
        urllib.error.URLError: If the request fails.
        KeyError: Propagated from ``titles_of`` / ``next_title`` when the
            response lacks the expected keys.
    """
    url = request_url.format(urllib.parse.quote(start_title))
    request = urllib.request.Request(url)
    request.add_header('User-Agent','neil.wpspider@njae.me.uk')
    # Use the response as a context manager so the underlying connection is
    # closed deterministically rather than whenever it is garbage-collected.
    with urllib.request.urlopen(request) as response:
        result = json.loads(response.read().decode())
    return titles_of(result), next_title(result)
# Truncate (or create) the titles file. The context manager closes the
# handle immediately instead of leaving it open for the rest of the run;
# the name `f` stays bound, as before.
with open(titles_file, 'w') as f:
    pass

# First batch: the initial request has no continuation parameter. It is sent
# with the same User-Agent header that request_again() uses, so both kinds of
# request identify the script consistently.
initial_request = urllib.request.Request(initial_request_url)
initial_request.add_header('User-Agent', 'neil.wpspider@njae.me.uk')
with urllib.request.urlopen(initial_request) as response:
    result = json.loads(response.read().decode())
n_title = next_title(result)
titles = titles_of(result)

# Next batch, starting at the continuation title from the first response.
# NOTE(review): in the code visible here the first batch held in `titles` is
# replaced without being passed to write_titles() -- confirm that is intended.
titles, n_title = request_again(n_title)