Jump to content

Wikipedia:WikiProject Astronomical objects/Stub processing/tools/stub listing perl script

From Wikipedia, the free encyclopedia

Originally written by User:Christopher Thomas. By posting it here, I recognize and acknowledge its release under appropriate Wikipedia licenses. --Christopher Thomas (talk) 19:25, 22 January 2012 (UTC)[reply]

#!/usr/bin/perl
#
# Stub Triage Script - Stub List Fetching
# Written by Christopher Thomas per WT:ASTRO thread discussion.
#
# Usage:  GetStubList.pl <category page> <output filename>
#
# This script fetches a category page and crawls through the "next" links
# to build a comprehensive list of pages in the category.
#
# Output is written to a text file. Each line contains a non-whitespace
# string representing the wiki URL (<foo> in en.wikipedia.org/<foo>),
# followed by a string containing the human-readable name (which may 
# contain whitespace).
#
# This script does not explore sub-categories.
#
# This script worked in January 2012. Wiki changes may break it later!
#


#
# Includes
#

use strict;



#
# Functions
#


# Displays a help screen.
# No arguments.
# No return value.

sub PrintHelp
{
  print << "Endofblock"

Stub Triage Script - Stub List Fetching
Written by Christopher Thomas per WT:ASTRO thread discussion.

Usage:  GetStubList.pl <category page> <output filename>

This script fetches a category page and crawls through the "next" links
to build a comprehensive list of pages in the category.

Output is written to a text file. Each line contains a non-whitespace
string representing the wiki URL (<foo> in en.wikipedia.org/<foo>),
followed by a string containing the human-readable name (which may 
contain whitespace).

This script does not explore sub-categories.

This script worked in January 2012. Wiki changes may break it later!

Endofblock
}



# Processes a category page.
# Extracts a list of pages in the category, and a link to the next
# page of entries in the category (if any).
# Arg 0 is the category page title and wiki arguments.
# Arg 1 points to a hash to store name information in.
# Returns the next title/argument string, or undef if no more pages.

# FIXME - Lots of magic constants in here.

sub ProcessPage
{
  my ($pname, $names_p, $next_pname);
  my (@pagedata, $lidx, $thisline);
  my ($done);
  my ($url, $label);


  # Default to end-of-list.
  $next_pname = undef;


  # Try to get arguments.
  $pname = $_[0];
  $names_p = $_[1];
  if (!( (defined $pname) && (defined $names_p) ))
  {
    print "### [ProcessPage]  Bad arguments.\n";
  }
  else
  {
    # NOTE: Leave the hash untouched; just add to it.


    # No matter what, delay so that we don't hammer the wiki.
    sleep(1);


    # Make sure we're asking for the "printable" version.

    if (!($pname =~ m/printable/))
    {
      $pname = $pname . '&printable=yes';
    }

    # Make sure we're asking for a full URL.

    if (!($pname =~ m/w\/index\.php/i))
    {
      $pname = '/w/index.php?title=' . $pname;
    }

    if (!($pname =~ m/http/i))
    {
      $pname = 'http://en.wikipedia.org' . $pname;
    }

    # Fix cruft.

    $pname =~ s/\&amp\;/\&/g;
    $pname =~ s/\ /_/g;


# FIXME - Diagnostics.
#print "Asking for URL: \"$pname\"\n";


    # Fetch the page.
    # FIXME - Doing this the messy but easy way.
    @pagedata = `lynx --source \"$pname\"`;


    # Skip down to "pages in category". This is always present.

    $thisline = "";
    $lidx = 0;
    while ( (defined $thisline) &&
      (!($thisline =~ m/name\=\"Pages_in_category/)) )
    {
      $thisline = $pagedata[$lidx];
      $lidx++;
    }


    # Handle absence gracefully.
    # Proceed if present.

    if (!(defined $thisline))
    {
      print "### [ProcessPage]  Can't find \"pages in category\"!\n";
      print "### (scanned $lidx lines)\n";
    }
    else
    {
      # Look for list entries.
      # Flag the "next 200" URL if we see one.
      # Stop when we see "</div>".

      # FIXME - If we ever do process subcategories, flag them here.

      $done = 0;

      while (!$done)
      {
        # Fetch the next line.
        $thisline = $pagedata[$lidx];
        $lidx++;

        # Check for end-of-list.
        if ($thisline =~ m/\<\/div\>/i)
        {
          $done = 1;
        }
        # If this is a non-template list entry, add it.
        elsif ($thisline =~
          m/\<li\>.+?href\=\"(\S+)\"\s+title=\"(.*?)\"\>/i)
        {
          $url = $1;
          $label = $2;

          if (!($label =~ m/template/i))
          {
            $$names_p{$url} = $label;
          }
        }
        # If this is a "next" field, record it.
        elsif ($thisline =~
          m/href=\"([^"]*?)\"[^>]*>next 200/i)
        {
          # This should happen twice if it happens (top/bottom).
          # That's fine.

# FIXME - Diagnostics.
#print "Next: \"$1\"\n";

          $next_pname = $1;
        }

        # Finished with this line.
      }
    }


    # Finished processing this page.
  }


  # Return a link to the next page.
  return $next_pname;
}



#
# Main Program
#

my ($catpage, $oname);
my ($names_p, $nidx);
my ($pcount);


$catpage = $ARGV[0];
$oname = $ARGV[1];

if ( (!(defined $catpage)) || (!(defined $oname)) || (defined $ARGV[2]) )
{
  PrintHelp();
}
elsif (!open(OFILE, ">$oname"))
{
  print "### Unable to write to \"$oname\".\n";
}
else
{
  $pcount = 1;

  while (defined $catpage)
  {
    print "Fetching page $pcount...\n";
    $pcount++;

    $names_p = {};

    $catpage = ProcessPage($catpage, $names_p);

    foreach $nidx (sort keys %$names_p)
    {
      print OFILE $nidx . "   " . $$names_p{$nidx} . "\n";
    }
  }


  # Close the output file no matter what.
  close(OFILE);
}



#
# This is the end of the file.
#