Jump to content

User:Sylvain Ribault/Saving script

From Wikipedia, the free encyclopedia

This is a Python 3 script for downloading and saving the wiki source for Wiki pages in a given category. It is parametrized for Appropedia.

#!/usr/bin/env python
# coding: utf-8

# # Saving content from Appropedia or other wikis
# 
# This script saves the wiki source of all wiki pages in a given category. It does not look into subcategories.

# In[ ]:


import html
import re
from urllib.request import Request, urlopen

def get_text(url, strip=False):
    """
    Download a web page and return it as text.

    url = a URL
    strip = whether to keep only the wiki content, i.e. what is inside the
            page's textarea element, after removing the fluff around it
    return: the text. With strip, the textarea content with all of its HTML
            character references decoded. Without strip, the whole page, in
            which only '&lt;' is replaced by '<' (historical behaviour, kept
            so that callers parsing the raw page see the same text as before)
    raises: ValueError if strip is set and the page contains no textarea
    """
    env = "textarea"

    # NOTE(review): the browser-like User-Agent is presumably needed because
    # the site rejects urllib's default one -- confirm before removing.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the connection even if reading or decoding fails.
    with urlopen(req) as response:
        text = response.read().decode("utf8")

    if not strip:
        return re.sub('&lt;', '<', text)

    # Greedy match: from the first opening tag to the last closing tag.
    match = re.search(env + '[^>]*>(.*)</' + env, text, re.DOTALL)
    if match is None:
        raise ValueError("No <" + env + "> element found at " + url)
    # Text inside a textarea is HTML-escaped by the server ('<' arrives as
    # '&lt;', '&' as '&amp;'), so decode every character reference instead of
    # only '&lt;'; otherwise '&amp;' and friends end up in the saved source.
    return html.unescape(match.group(1))

# Manual check, disabled by default: change False to True to print the wiki
# source of one sample page (this downloads the page, so it needs network access).
if False:
    url = "https://www.appropedia.org/w/index.php?title=List_of_low-carbon_conferences&action=edit"
    print(get_text(url, strip = True))


# In[ ]:


import re 

def get_pages(category, verbose = True):
    """
    List the pages of a category by scanning the category's web page.

    category = the category name; spaces are turned into underscores
    verbose = if true, print how many pages were found
    return: a list holding, for each "<li>" found in the page, the first
            double-quoted string that follows it
    """
    listing_url = "https://www.appropedia.org/Category:" + category.replace(' ', '_')
    listing = get_text(listing_url)

    # findall with a single group returns exactly the captured strings.
    pages = re.findall('<li>[^"]*"([^"]*)"', listing)

    if verbose:
        print('Found', len(pages), 'pages in category', category)
    return pages
    
# Manual check, disabled by default: change False to True to list the pages
# of two sample categories (this downloads the category pages).
if False:
    print(get_pages("Academia"))
    print(get_pages("Air travel"))


# In[ ]:


import re 

def get_edit_url(title):
    """
    Build the URL of the edit page of a wiki page.

    title = a title of a wiki page, a site-relative link to it (such as
            "/Some_page"), or its full URL on a wiki that serves pages
            directly under the site root
    return : the URL of the edit page, where the source can be viewed
    """
    prefix = "https://www.appropedia.org/w/index.php?title="
    suffix = "&action=edit"

    # Remove only the "scheme://host" part of a full URL and the leading
    # slash(es) of a link. Slashes inside the page name are kept: they mark
    # subpages ("Foo/Bar"), and cutting at the last slash would point to a
    # different page ("Bar").
    without_host = re.sub(r'^[A-Za-z][A-Za-z0-9+.-]*://[^/]*', '', title)
    page_name = without_host.lstrip('/')
    underscore_title = page_name.replace(' ', '_')  # Replacing spaces with underscores

    return prefix + underscore_title + suffix

# Manual check, disabled by default: change False to True to print the edit
# URL built from a full page URL and from a plain title.
if False:
    print(get_edit_url("https://www.appropedia.org/List_of_low-carbon_conferences"))
    print(get_edit_url("List of low-carbon conferences"))


# In[ ]:


import datetime

def save(category, filename):
    """
    Append the wiki source of every page of a category to a file.

    For each page, a header with the URL of its edit page is written,
    followed by the page's wiki source. A footer with the category name, the
    current date and the text lengths is written at the end, and the lengths
    are also printed.

    category = the name of a category
    filename = a file name; the file is appended to, not overwritten
    """
    separator = '\n%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n\n'

    pages = get_pages(category)
    lengths = []
    # The context manager closes the file even if a download fails half-way.
    # UTF-8 is set explicitly: the pages are decoded as UTF-8, and the
    # platform's default codec may be unable to encode some of their characters.
    with open(filename, "a", encoding="utf-8") as f:
        for page in pages:
            url = get_edit_url(page)
            f.write(separator)
            f.write("URL: " + url + '\n')
            f.write(separator)
            text = get_text(url, strip = True)
            f.write(text)
            lengths.append(len(text))
        f.write(separator)
        f.write("Saved category " + category + " on date " + str(datetime.datetime.now()) + '\n')
        numbers = "Text length: total " + str(sum(lengths)) + ", values " + str(lengths) + '\n'
        f.write(numbers)
    print(numbers)


# In[ ]:


# Entry point of the script: download every page of the "Academia" category
# and append the result to the file "Appropedia.save".
save("Academia", "Appropedia.save")