Wikipedia:WikiProject Astronomical objects/Stub processing/tools/stub listing perl script
Appearance
Originally written by User:Christopher Thomas. By posting it here, I recognize and acknowledge its release under appropriate Wikipedia licenses. --Christopher Thomas (talk) 19:25, 22 January 2012 (UTC)
#!/usr/bin/perl
#
# Stub Triage Script - Stub List Fetching
# Written by Christopher Thomas per WT:ASTRO thread discussion.
#
# Usage: GetStubList.pl <category page> <output filename>
#
# This script fetches a category page and crawls through the "next" links
# to build a comprehensive list of pages in the category.
#
# Output is written to a text file. Each line contains a non-whitespace
# string representing the wiki URL (<foo> in en.wikipedia.org/<foo>),
# followed by a string containing the human-readable name (which may
# contain whitespace).
#
# This script does not explore sub-categories.
#
# This script worked in January 2012. Wiki changes may break it later!
#
#
# Includes
#
use strict;
use warnings;
#
# Functions
#
# Displays a help screen on STDOUT.
# No arguments.
# No return value of interest.
sub PrintHelp
{
    # Non-interpolating heredoc; the help text contains no variables.
    my $help_text = <<'END_HELP';
Stub Triage Script - Stub List Fetching
Written by Christopher Thomas per WT:ASTRO thread discussion.
Usage: GetStubList.pl <category page> <output filename>
This script fetches a category page and crawls through the "next" links
to build a comprehensive list of pages in the category.
Output is written to a text file. Each line contains a non-whitespace
string representing the wiki URL (<foo> in en.wikipedia.org/<foo>),
followed by a string containing the human-readable name (which may
contain whitespace).
This script does not explore sub-categories.
This script worked in January 2012. Wiki changes may break it later!
END_HELP

    print $help_text;
}
# Processes a category page.
# Extracts a list of pages in the category, and a link to the next
# page of entries in the category (if any).
#
# Arg 0 is the category page title and wiki arguments (a bare title,
#   a /w/index.php?... path, or a full URL; missing pieces are added).
# Arg 1 points to a hash to store name information in (URL => label).
#   The hash is only added to; existing entries are left untouched.
#
# Returns the next title/argument string, or undef if no more pages.
#
# NOTE(review): this shells out to "lynx" and scrapes the live wiki's
# 2012-era markup (the "Pages_in_category" anchor, "next 200" links).
# FIXME - Lots of magic constants in here.
sub ProcessPage
{
    # Default to end-of-list.
    my $next_pname = undef;

    my $pname   = $_[0];
    my $names_p = $_[1];

    if (!( (defined $pname) && (defined $names_p) ))
    {
        print "### [ProcessPage] Bad arguments.\n";
    }
    else
    {
        # NOTE: Leave the hash untouched; just add to it.

        # No matter what, delay so that we don't hammer the wiki.
        sleep(1);

        # Make sure we're asking for the "printable" version.
        if (!($pname =~ m/printable/))
        {
            $pname = $pname . '&printable=yes';
        }

        # Make sure we're asking for a full URL.
        if (!($pname =~ m/w\/index\.php/i))
        {
            $pname = '/w/index.php?title=' . $pname;
        }
        if (!($pname =~ m/http/i))
        {
            $pname = 'http://en.wikipedia.org' . $pname;
        }

        # Fix cruft: undo "&;" entity mangling and turn spaces into
        # underscores so the URL is shell- and wiki-friendly.
        $pname =~ s/\&\;/\&/g;
        $pname =~ s/\ /_/g;

        # Fetch the page.
        # FIXME - Doing this the messy but easy way. $pname is
        # interpolated into a shell command; it's derived from wiki
        # links, but quoting via quotemeta (or a list-form pipe open)
        # would be safer against hostile link text.
        my @pagedata = `lynx --source \"$pname\"`;

        # Skip down to "pages in category". This is always present.
        my $thisline = "";
        my $lidx     = 0;
        while ( (defined $thisline)
            &&  (!($thisline =~ m/name\=\"Pages_in_category/)) )
        {
            $thisline = $pagedata[$lidx];
            $lidx++;
        }

        # Handle absence gracefully.
        # Proceed if present.
        if (!(defined $thisline))
        {
            print "### [ProcessPage] Can't find \"pages in category\"!\n";
            print "### (scanned $lidx lines)\n";
        }
        else
        {
            # Look for list entries.
            # Flag the "next 200" URL if we see one.
            # Stop when we see "</div>".
            # BUGFIX: also stop when we run out of lines; previously a
            # page with no "</div>" after the anchor spun forever,
            # matching regexes against undef.
            # FIXME - If we ever do process subcategories, flag them here.
            my $done = 0;
            while (!$done)
            {
                # Fetch the next line.
                $thisline = $pagedata[$lidx];
                $lidx++;

                # Check for end-of-data (malformed/truncated page).
                if (!(defined $thisline))
                {
                    $done = 1;
                }
                # Check for end-of-list.
                elsif ($thisline =~ m/\<\/div\>/i)
                {
                    $done = 1;
                }
                # If this is a non-template list entry, add it.
                elsif ($thisline =~
                    m/\<li\>.+?href\=\"(\S+)\"\s+title=\"(.*?)\"\>/i)
                {
                    my $url   = $1;
                    my $label = $2;
                    if (!($label =~ m/template/i))
                    {
                        $$names_p{$url} = $label;
                    }
                }
                # If this is a "next" field, record it.
                elsif ($thisline =~
                    m/href=\"([^"]*?)\"[^>]*>next 200/i)
                {
                    # This should happen twice if it happens (top/bottom).
                    # That's fine.
                    $next_pname = $1;
                }
                # Finished with this line.
            }
        }
        # Finished processing this page.
    }

    # Return a link to the next page.
    return $next_pname;
}
#
# Main Program
#
# Usage: GetStubList.pl <category page> <output filename>
# Crawls the category starting at $ARGV[0], following "next 200" links
# via ProcessPage(), and writes "url label" lines to $ARGV[1].

my $catpage = $ARGV[0];
my $oname   = $ARGV[1];

# Exactly two arguments are required; anything else gets the help text.
if ( (!(defined $catpage)) || (!(defined $oname)) || (defined $ARGV[2]) )
{
    PrintHelp();
}
# Three-arg open with a lexical filehandle (the old two-arg ">$oname"
# form was vulnerable to mode injection via the filename).
elsif (!open(my $ofile, '>', $oname))
{
    print "### Unable to write to \"$oname\".\n";
}
else
{
    my $pcount = 1;

    # ProcessPage returns the next page's URL, or undef when done.
    while (defined $catpage)
    {
        print "Fetching page $pcount...\n";
        $pcount++;

        my $names_p = {};
        $catpage = ProcessPage($catpage, $names_p);

        # Write this page's entries sorted by URL: "url label\n".
        foreach my $nidx (sort keys %$names_p)
        {
            print {$ofile} $nidx . " " . $$names_p{$nidx} . "\n";
        }
    }

    # Close the output file no matter what; buffered write errors on a
    # write handle only surface at close, so check the result.
    close($ofile)
        or print "### Error closing \"$oname\".\n";
}
#
# This is the end of the file.
#