# User:HBC archive builderbot/source

use strict;
use Storable;
use LWP::UserAgent;
use HTTP::Request::Common;
use XML::Simple;
use URI::Escape;
use Data::Dumper;
use Algorithm::Diff qw(diff);

# HTTP client used for every request; the agent string identifies the bot
# and its operator to the server. get_complete_history() uses this handle.
my $ua = LWP::UserAgent->new('agent' => 'HBC archive builderbot v0.1 - developing (Operated by User:HighInBC)');
my $nowiki = ('nowiki'); # Tag name kept in a variable so the literal tag never appears in this source when it is displayed on wiki

# Page whose history is scanned, and the title used inside generated links.
my $page = 'Wikipedia:Requests for comment/User names';
my $shortcut;
$shortcut = 'WP:RFCN';
$shortcut ||= $page; # Fall back to the full title when no shortcut is configured
my %revisions = get_complete_history($page);

# Walk the revisions in ascending id order and compare each one with its
# predecessor. Every "== heading ==" line that an edit removed is reported
# (via warn, i.e. on STDERR) as wikitext: a bold date line whenever the day
# changes, then one bullet per edit linking to the revision *before* the
# removal, which is the last one that still contains the removed sections.
my(@old_content); # Lines of the previous revision
my($old_key);     # Id of the previous revision
my $day;          # Date (YYYY-MM-DD) of the last date line printed
foreach my $key (sort {$a <=> $b} keys(%revisions))
  {
  my $revision = $revisions{$key};
  my(@content) = split("\n",${$revision}{'text'}{'content'});
  my $timestamp = ${$revision}{'timestamp'};

  # XML::Simple hands back a hash ref for an empty or attribute-only
  # <comment> element; treat anything that is not a plain string as "no summary".
  my $summary = ${$revision}{'comment'};
  $summary = '' unless (defined($summary) && !ref($summary));
  $summary =~ s|/\*.*\*/\s*||; # Drop the automatic "/* section */" prefix

  # Logged-in edits carry <username>, anonymous edits carry <ip> instead.
  my $contributor = ${$revision}{'contributor'};
  my $user = '';
  if (ref($contributor) eq 'HASH')
    {
    foreach my $field ('username', 'ip')
      {
      my $value = ${$contributor}{$field};
      next unless (defined($value) && !ref($value) && length($value));
      $user = $value;
      last;
      }
    }

  # Collect the headings of all sections removed by this edit.
  my (@headings);
  if (scalar(@content) && scalar(@old_content))
    {
    foreach my $ra_hunk (diff(\@old_content, \@content))
      {
      foreach my $ra_diff (@{$ra_hunk})
        {
        my($action,$line) = @{$ra_diff}[0,2];
        next unless ($action eq '-');
        # Lazy capture plus \s* keeps surrounding whitespace out of the heading.
        next unless ($line =~ m|==\s*([^=]*?)\s*==|);
        my $heading = $1;
        # Neutralise templates such as {{User:Foo}} so they are shown, not transcluded.
        $heading =~ s|(\{\{.*:.*\}\})|<$nowiki>$1</$nowiki>|;
        push(@headings,$heading);
        }
      }
    }

  if (scalar(@headings))
    {
    # Capture into named variables so a failed match can never reuse the
    # stale $1/$2 left behind by the heading regex above.
    my($date,$time);
    if (defined($timestamp))
      {
      ($date,$time) = ($timestamp =~ m|(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z|);
      }
    $time = (defined($timestamp) ? $timestamp : '') unless (defined($time)); # Show the raw value if it is not in the expected format
    if (defined($date) && !(defined($day) && ($date eq $day)))
      {
      $day = $date;
      warn "'''$day'''\n";
      }
    my $archive_link = "'''[{{fullurl:$shortcut|oldid=$old_key}} Archive link]'''";
    if (scalar(@headings) > 1)
      {
      warn "* '''$time''': $archive_link - ($summary ([[User:$user|$user]])) - (".scalar(@headings)." entries)\n";
      foreach my $heading (@headings)
        {
        warn "** $heading\n";
        }
      }
    else
      {
      warn "* '''$time''': $archive_link - $headings[0] - ($summary ([[User:$user|$user]]))\n";
      }
    }

  # This revision becomes the baseline for the next comparison.
  @old_content = @content;
  $old_key = $key;
  }

# get_complete_history($page)
#
# Returns a hash (list form) of every known revision of the wiki page $page,
# keyed by revision id; each value is the revision structure produced by
# XML::Simple from the Special:Export output.
#
# Revisions already stored in cache/<uri-escaped title> (Storable format) are
# loaded first, and only revisions newer than the newest cached timestamp are
# requested. Downloads happen in batches; a batch in which every revision is
# new and the batch is full means there may be more, so another one is
# requested. The cache file is rewritten when anything new was fetched.
#
# Uses the file-level $ua user agent. Progress messages go to STDERR.
# Dies after several consecutive failed downloads instead of retrying forever.
#
# TODO: request gzip-compressed responses (original note: "100 times smaller").
sub get_complete_history
  {
  my $page = shift;
  my $batch_size   = 100; # Revisions requested per call
  my $max_failures = 5;   # Consecutive bad responses tolerated before giving up
  my $retry_delay  = 5;   # Seconds to wait before retrying a failed request

  mkdir('cache') unless (-d('cache'));
  my $fname = 'cache/'.uri_escape($page);
  my(%revisions);
  my $offset = '0';
  if (-f($fname))
    {
    warn "Found '$page' in cache, loading...\n";
    %revisions = %{retrieve($fname)};
    my(@keys) = sort {$a <=> $b} keys(%revisions);
    # Resume from the timestamp of the most recent cached revision. The guard
    # keeps an empty cache from autovivifying a bogus '' entry in %revisions.
    $offset = $revisions{$keys[-1]}{'timestamp'} if (scalar(@keys));
    warn (scalar(keys(%revisions))." loaded from cache.\n");
    }
  else
    {
    warn "No cache, starting fresh.\n";
    }

  my $export_url = 'http://en.wikipedia.org/w/index.php?title=Special:Export';
  my $total = 0;    # New revisions fetched during this call
  my $failures = 0; # Consecutive unusable responses
  while (1)
    {
    warn "\nDownloading as many as $batch_size revisions starting at ".($offset || 'the start')."\n";
    my $res = $ua->request
      (
       POST($export_url,
            Content_Type => 'application/x-www-form-urlencoded',
            Content      => [(
                              'pages'  => $page,
                              'action' => 'submit',
                              'submit' => 'Export',
                              'limit'  => $batch_size,
                              'offset' => $offset
                            )])
      );
    my $current = $res->content();

    # Anything that is not an export document counts as a failed attempt.
    unless (defined($current) && ($current =~ m|^<mediawiki|))
      {
      $failures++;
      if ($failures >= $max_failures)
        {
        die "Giving up on '$page' after $failures consecutive failed downloads (last status: ".$res->status_line().")\n";
        }
      warn "Failed somehow, trying again.\n";
      sleep($retry_delay);
      next;
      }
    $failures = 0;

    # The last <timestamp> element in the document becomes the next offset.
    # 43 = length of "<timestamp>" + 20-character timestamp + "</timestamp>".
    my $stamp_at = rindex($current, '<timestamp>');
    if (($stamp_at >= 0) && (substr($current,$stamp_at,43) =~ m|<timestamp>(.+?)</timestamp>|))
      {
      $offset = $1;
      }

    my $xml_data = XMLin($current);
    my $count = 0; # New revisions in this batch
    my $fetched = ${$xml_data}{'page'}{'revision'};
    if ((ref($fetched) eq 'HASH') && scalar(keys(%{$fetched})))
      {
      if (${$fetched}{'id'})
        {
        # Exactly one revision: XML::Simple returns it directly, 'id' included.
        my $id = ${$fetched}{'id'};
        unless ($revisions{$id}) {$count++;$total++;}
        $revisions{$id} = $fetched;
        }
      else
        {
        # Several revisions: XML::Simple folds them into a hash keyed by id.
        foreach my $revision (sort {$a <=> $b} keys(%{$fetched}))
          {
          unless ($revisions{$revision}) {$count++;$total++;}
          $revisions{$revision} = ${$fetched}{$revision};
          }
        warn Dumper($xml_data) unless ($total); # Debug aid: data arrived but nothing was new
        }
      }
    warn "Got $count revisions\n";

    # A full batch of new revisions means the history may continue.
    last unless ($count == $batch_size);
    warn "Still more.\n";
    }

  if ($total > 0)
    {
    warn "Saving cache...\n";
    store(\%revisions, $fname);
    warn "done.\n";
    }
  return %revisions;
  }