# User:MastCell/dermimages.py

# File: Dermimages.py
# By MastCell
# Released for any and all reuse and modification
# Use at your own risk.
# ========================
# This script does the following:
#  1. Load all articles linked to the Dermatology Task Force,
#     using Category:Dermatology task force articles.
#  2. Check each page for images, removing those which are part of
#     common templates
#  3. Output the results in a sortable wikitable which can be
#     cut-and-pasted onto Wikipedia.
#
# The goal is to assess how many dermatology-related articles currently
# lack images, and to assess the overall prevalence of images across
# derm-related articles.
#
# The script can be run from the command line. It will create a file
# called "dermimages_output.txt" in the current working directory (the
# directory the script is run from, which is not necessarily where the
# script itself resides). Note that if such a file already exists, it
# will be overwritten. The output format is meant to be cut-and-pasted
# into Wikipedia as a wikitable.
# =========================

# mwclient module for MediaWiki API calls
import mwclient


# Global set of image names that are left out of the per-page counts.
# These are images that come from templates, featured-article stars and
# similar decorations, which should not be counted as "content" images.
# processPage compares each image's page_title against this set.
# Modify as needed.
global_exclude_list = {
    "Normal Epidermis and Dermis with Intradermal Nevus 10x.JPG",
    "LinkFA-star.png",
    "Featured article star.svg",
    "Symbol support vote.svg",
    "Rod of Asclepius2.svg",
    "Mergefrom.svg",
    "Gray944.png",
    "Question book-new.svg",
    "Ambox contradict.svg",
    "Mitotic spindle color micrograph.gif",
    "Ambox content.png",
    "Text document with red question mark.svg",
    "Edit-clear.svg",
    "UK-Medical-Bio-Stub.svg",
    "Flag of Germany.svg",
    "Commons-logo.svg",
    "Wiki letter w.svg",
    "Chromosome.svg",
    "DNA stub.png",
    "Merge-arrow.svg",
}


# Global dictionary of running totals: the number of pages processed,
# plus how many of them have zero, one, two, three, or four-or-more
# counted images. Every counter starts at zero; main_program and
# incrementPageCounter update it and outputDictionary writes it out.
global_imagenums = dict.fromkeys(
    (
        "Total pages",
        "Pages with zero images",
        "Pages with one image",
        "Pages with two images",
        "Pages with three images",
        "Pages with four or more images",
    ),
    0,
)


# Connect to English Wikipedia and look up the task-force category.
# (The category members will generally be article talk pages, since
#  that's where the Derm task force template is typically placed;
#  main_program maps each one back to its article.)
wpHandle = mwclient.Site("en.wikipedia.org")
dermTalkPages = wpHandle.Pages["Category:Dermatology task force articles"]


# Main program loop: load and process each page
def main_program():
    """Write the whole report: table header, one row per page, footer, totals.

    Iterates over dermTalkPages, looks up the page with the same title
    through wpHandle, bumps the "Total pages" counter, and hands the page
    and its images to processPage.
    """
    setUpTable()
    for talkPage in dermTalkPages:
        # The category holds talk pages; fetch the article itself by title.
        article = wpHandle.Pages[talkPage.page_title]

        # Count the page before its images are loaded.
        global_imagenums["Total pages"] += 1

        processPage(article, article.images())
    closeTable()
    outputDictionary()
    
# Helper for processPage: make a page or image name safe to write.
def _encodeForOutput(text):
    """Return *text* in a form that outputFile.write() accepts.

    Characters that ISO-8859-15 cannot represent are replaced by XML
    character references (e.g. "&#322;"); hopefully those are rendered
    properly by MediaWiki once the table is pasted in.

    On Python 2 the encoded byte string is returned as-is, exactly what
    the script has always written. On Python 3, encode() returns bytes,
    which can neither be concatenated with str nor written to a text-mode
    file, so the bytes are decoded back to str.
    """
    encoded = text.encode("iso-8859-15", "xmlcharrefreplace")
    if isinstance(encoded, str):
        # Python 2: str is the byte-string type, ready to write.
        return encoded
    return encoded.decode("iso-8859-15")


# Page processing function
# Note: the page and image names must be encoded before writing.
# Otherwise the script will eventually choke with a UnicodeEncodeError
# (on Python 2) when a name contains a character the output cannot hold.
def processPage(page, imageList):
    """Write one wikitable row for *page* and record its image count.

    The row holds a link to the page, a "<br>"-separated list of links to
    its images, and the number of images. Images whose page_title is in
    global_exclude_list are skipped and not counted.

    page      -- object with a .name attribute (the page title to link to)
    imageList -- iterable of objects with .name and .page_title attributes

    Side effects: writes to outputFile and updates the page totals through
    incrementPageCounter.
    """
    outputFile.write("|-\n")
    outputFile.write("| [[" + _encodeForOutput(page.name) + "]] ||")

    # The leading colon makes each entry a link to the image page rather
    # than an inline display of the image.
    imageLinks = [
        "[[:" + _encodeForOutput(image.name) + "]]"
        for image in imageList
        if image.page_title not in global_exclude_list
    ]
    imageCount = len(imageLinks)

    outputFile.write("<br>\n".join(imageLinks))
    outputFile.write("\n" + " || " + str(imageCount) + "\n")
    incrementPageCounter(imageCount)


# Function to update the dictionary of page totals
def incrementPageCounter(numImages):
    """Add one to the global_imagenums bucket matching *numImages*.

    numImages -- number of counted images found on a page

    Counts of 0, 1, 2 and 3 each have their own bucket; every other value
    lands in "Pages with four or more images".
    """
    # Look the count up by value. Comparing integers with "is" tests object
    # identity, which only happens to work for small ints as a CPython
    # implementation detail (and triggers a SyntaxWarning on Python 3.8+).
    bucketForCount = {
        0: "Pages with zero images",
        1: "Pages with one image",
        2: "Pages with two images",
        3: "Pages with three images",
    }
    bucket = bucketForCount.get(numImages, "Pages with four or more images")
    global_imagenums[bucket] += 1
        

# Output the table header boilerplate
def setUpTable():
    """Write the wikitable opening line and the column-header row to outputFile."""
    headerLines = (
        '{| class="wikitable sortable" border="1"',
        '! Page !! class="unsortable" | Images !! Number of images',
    )
    for headerLine in headerLines:
        outputFile.write(headerLine)
        outputFile.write("\n")


# Output the table footer boilerplate
def closeTable():
    """Write the wikitable closing marker to outputFile (no trailing newline)."""
    tableFooter = "|}"
    outputFile.write(tableFooter)


# Output the dictionary counts
def outputDictionary():
    """Write a "Totals by number of images" section to outputFile.

    A heading line is followed by one "label: count" line for every entry
    in global_imagenums.
    """
    outputFile.write("\n== Totals by number of images ==\n")
    # items() exists on both Python 2 and Python 3; iteritems() was removed
    # in Python 3 and raises AttributeError there.
    for label, count in global_imagenums.items():
        outputFile.write(label + ": " + str(count) + "\n")
    
    
########################################
# Script entry point
# ------------------
# Creates (or overwrites) dermimages_output.txt, binds the open file to
# the module-level name outputFile that the writer functions use, and
# runs the main loop. The file is closed when the with-block exits.
########################################
with open("dermimages_output.txt", "w") as outputFile:
    main_program()