User:Sylvain Ribault/Saving script
Appearance
This is a Python 3 script that downloads and saves the wiki source of all wiki pages in a given category. It is parametrized for Appropedia.
#!/usr/bin/env python
# coding: utf-8
# # Saving content from Appropedia or other wikis
#
# This script saves the wiki source of all wiki pages in a given category. It does not look into subcategories.
# In[ ]:
import html
import re
from urllib.request import Request, urlopen
def get_text(url, strip = False):
    """
    Download a page and return its text, decoded as UTF-8.

    url = a URL
    strip = whether to keep only the wiki content, after removing the fluff
    return: the whole page if strip is false; otherwise the content of the
            page's textarea element, with HTML entities decoded

    Raises ValueError if strip is true and the page has no textarea element.
    """
    env = "textarea"
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection even if reading or decoding fails.
    with urlopen(req) as response:
        text = response.read().decode("utf8")
    if not strip:
        return text
    # Greedy match with DOTALL: from the first opening tag to the last closing tag.
    match = re.search(env + r'[^>]*>(.*)</' + env, text, re.DOTALL)
    if match is None:
        raise ValueError("No " + env + " element found at " + url)
    # Text inside an HTML element is entity-escaped (&lt; for <, &amp; for &, ...).
    # Decode it only after extraction, so that an escaped closing tag inside the
    # wiki source cannot be mistaken for the end of the element.
    return html.unescape(match.group(1))


# Disabled manual check (needs network access); switch to True to try it.
if False:
    url = "https://www.appropedia.org/w/index.php?title=List_of_low-carbon_conferences&action=edit"
    print(get_text(url, strip = True))
# In[ ]:
import re
def get_pages(category, verbose = True):
    """
    List the pages found on a category page. Subcategories are not explored.

    category = the name of a category (spaces allowed)
    verbose = whether to print the number of pages
    return: the list of the pages in that category, each one being the first
            double-quoted string that follows a <li> tag on the category page
    """
    # Spaces become underscores in the category page address.
    listing_url = "https://www.appropedia.org/Category:" + category.replace(' ', '_')
    listing = get_text(listing_url)
    # With a single capture group, findall returns the captured strings directly.
    pages = re.findall('<li>' + '[^"]*"([^"]*)"', listing)
    if verbose:
        print('Found', len(pages), 'pages in category', category)
    return pages


# Disabled manual check (needs network access); switch to True to try it.
if False:
    print(get_pages("Academia"))
    print(get_pages("Air travel"))
# In[ ]:
import re
def get_edit_url(title):
    """
    Build the URL of the edit page of a wiki page.

    title = a title, a site-relative path (such as /Some_page) or a full URL
            of a wiki page
    return : the URL of the edit page, where the source can be viewed
    """
    prefix = "https://www.appropedia.org/w/index.php?title="
    suffix = "&action=edit"
    # Remove only the scheme and host of a full URL, then any leading slash.
    # Cutting at the last slash instead would truncate subpage titles such as
    # "User:Name/Subpage" to "Subpage".
    page_name = re.sub(r'^https?://[^/]+/?', '', title).lstrip('/')
    # Replacing spaces with underscores
    return prefix + page_name.replace(' ', '_') + suffix


# Disabled manual check; switch to True to try it.
if False:
    print(get_edit_url("https://www.appropedia.org/List_of_low-carbon_conferences"))
    print(get_edit_url("List of low-carbon conferences"))
# In[ ]:
import datetime
def save(category, filename):
    """
    Append the wiki source of every page in a category to a file.

    For each page, the file receives the URL of its edit page and then its
    wiki source, set apart by separator lines. A final summary gives the
    category, the current date and the text lengths; the lengths are also
    printed.

    category = the name of a category
    filename = a file name; the file is created if needed and appended to
    """
    separator = '\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\n'
    pages = get_pages(category)
    counter = []  # Length of the saved text of each page, in order
    # "with" closes the file even if a download fails midway. The explicit
    # UTF-8 encoding keeps non-ASCII wiki text writable whatever the platform
    # default encoding is.
    with open(filename, "a", encoding="utf-8") as f:
        for page in pages:
            url = get_edit_url(page)
            f.write(separator)
            f.write("URL: " + url + '\n')
            f.write(separator)
            text = get_text(url, strip = True)
            f.write(text)
            counter.append(len(text))
        f.write(separator)
        f.write("Saved category " + category + " on date " + str(datetime.datetime.now()) + '\n')
        numbers = "Text length: total " + str(sum(counter)) + ", values " + str(counter) + '\n'
        f.write(numbers)
    print(numbers)
# In[ ]:
# Script entry: append the source of the pages in category "Academia" to the
# file Appropedia.save. This runs whenever the file is executed.
save("Academia", "Appropedia.save")