User:HBC archive builderbot/source
use strict;
use Storable;
use LWP::UserAgent;
use HTTP::Request::Common;
use XML::Simple;
use URI::Escape;
use Data::Dumper;
use Algorithm::Diff qw(diff);

my $ua = LWP::UserAgent->new('agent' => 'HBC archive builderbot v0.1 - developing (Operated by User:HighInBC)');
my $nowiki = ('nowiki'); # So it doesn't screw up the display of the source code on wiki

my $page = 'Wikipedia:Requests for comment/User names';
my $shortcut;
$shortcut = 'WP:RFCN';
$shortcut ||= $page; # Fall back to the full page name if no shortcut is set

my %revisions = get_complete_history($page);

my(@old_content);
my($old_key);
my $day = ''; # Current date header; starts empty so the first date always prints

KEY: foreach my $key (sort {$a <=> $b} keys(%revisions)) {
  my(@content)  = split("\n", ${$revisions{$key}}{'text'}{'content'});
  my $timestamp = ${$revisions{$key}}{'timestamp'};
  my $summary   = ${$revisions{$key}}{'comment'};
  $summary =~ s|/\*.*\*/\s*||; # Strip the /* section */ prefix from the edit summary
  my $user = ${$revisions{$key}}{'contributor'}{'username'};
  my(@headings);

  # Diff this revision against the previous one; any '== heading ==' on a
  # removed line is a section that was archived by this edit.
  if (scalar(@content) && scalar(@old_content)) {
    my @diffs = diff(\@old_content, \@content);
    foreach my $ra_hunk (@diffs) {
      foreach my $ra_diff (@{$ra_hunk}) {
        my($action, $content) = @{$ra_diff}[0,2];
        if (($content =~ m|==\s?([^=]*)\s?==|) && ($action eq '-')) {
          my $heading = $1;
          # Wrap templates in <nowiki> so they don't expand in the report
          ($heading =~ s|(\{\{.*:.*\}\})|<$nowiki>$1</$nowiki>|) if ($heading =~ m|\{\{.*:.*\}\}|);
          push(@headings, $heading);
        }
      }
    }
  }

  # Report each archiving edit as a wikitext list item (output goes to STDERR via warn)
  if (scalar(@headings)) {
    $timestamp =~ m|(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z|;
    if ($1 ne $day) {
      $day = $1;
      warn "'''$day'''\n";
    }
    my $time = $2;
    my $archive_link = "'''[{{fullurl:$shortcut|oldid=$old_key}} Archive link]'''";
    if (scalar(@headings) > 1) {
      warn "* '''$time''': $archive_link - ($summary ([[User:$user|$user]])) - (".scalar(@headings)." entries)\n";
      foreach my $heading (@headings) {
        warn "** $heading\n";
      }
    } elsif (scalar(@headings) == 1) {
      warn "* '''$time''': $archive_link - $headings[0] - ($summary ([[User:$user|$user]]))\n";
    }
  }

  @old_content = @content;
  $old_key     = $key;
}

sub get_complete_history # Add Gzip, 100 times smaller, gee where did that ratio come from??
{
  mkdir('cache') unless (-d('cache'));
  my $page = shift;
  my(%revisions);
  my $count;
  my $offset;
  my $fname = 'cache/'.uri_escape($page);

  if (-f($fname)) {
    warn "Found '$page' in cache, loading...\n";
    %revisions = %{retrieve($fname)};
    my(@keys) = sort {$a <=> $b} keys(%revisions);
    $offset = ($revisions{$keys[scalar(@keys)-1]}{'timestamp'}); # Get timestamp of most recent revision
    warn(scalar(keys(%revisions))." loaded from cache.\n");
  } else {
    warn "No cache, starting fresh.\n";
    $offset = '0';
  }

  my $total;
  GETMORE:
  warn "\nDownloading as many as 100 revisions starting at ".($offset || 'the start')."\n";
  my $index = 'http://en.wikipedia.org/w/index.php';
  my $res = $ua->request(
    POST $index."?title=Special:Export",
    Content_Type => 'application/x-www-form-urlencoded',
    Content      => [(
      'pages'  => $page,
      'action' => 'submit',
      'submit' => 'Export',
      'limit'  => 100,
      'offset' => $offset,
    )]
  );
  my $current = $res->content();
  unless ($current =~ m|^<mediawiki|) {
    warn "Failed somehow, trying again.\n";
    goto GETMORE;
  }

  # The timestamp of the last revision in this batch becomes the offset for the next request
  my $ts_pos = rindex($current, '<timestamp>');
  my $string = substr($current, $ts_pos, 43);
  $string =~ m|<timestamp>(.+?)</timestamp>|;
  $offset = $1;

  my $xml_data = XMLin($current);
  $count = 0;
  if (!scalar(keys(%{${$xml_data}{page}{revision}}))) {
    # do nothing - no revisions came back
  } elsif (${$xml_data}{'page'}{'revision'}{'id'}) {
    # Only one revision returned, so XMLin handed back the revision hash directly
    unless ($revisions{${$xml_data}{'page'}{'revision'}{'id'}}) { $count++; $total++; }
    $revisions{${$xml_data}{'page'}{'revision'}{'id'}} = ${$xml_data}{'page'}{'revision'};
  } else {
    # Several revisions returned, keyed by revision id
    foreach my $revision (sort {$a <=> $b} keys(%{${$xml_data}{'page'}{'revision'}})) {
      unless ($revisions{$revision}) { $count++; $total++; }
      $revisions{$revision} = ${$xml_data}{'page'}{'revision'}{$revision};
    }
    warn Dumper($xml_data) unless ($total);
  }
  warn "Got $count revisions\n";

  if ($count == 100) {
    warn "Still more.\n";
    goto GETMORE;
  }
  if ($total > 0) {
    warn "Saving cache...\n";
    store(\%revisions, $fname);
    warn "done.\n";
  }
  return %revisions;
}