User:Joe's Olympic Bot/source
Appearance
Not finished, but a work in progress on the traversal code. Note the unused subroutine at bottom, which has been tested lightly for grabbing a URL from the london2012.com site's search function.
use MediaWiki::API; use Encode; use LWP::UserAgent; use utf8; # Gently pruned from the standard exclusion code to hardcode $user and $opt sub allowBots { my($text) = @_; my $user = "Joe's Olympic Bot"; return 0 if $text =~ /{{[nN]obots}}/; return 1 if $text =~ /{{[bB]ots}}/; if($text =~ /{{[bB]ots\s*\|\s*allow\s*=\s*(.*?)\s*}}/s){ return 1 if $1 eq 'all'; return 0 if $1 eq 'none'; my @bots = split(/\s*,\s*/, $1); return (grep $_ eq $user, @bots)?1:0; } if($text =~ /{{[bB]ots\s*\|\s*deny\s*=\s*(.*?)\s*}}/s){ return 0 if $1 eq 'all'; return 1 if $1 eq 'none'; my @bots = split(/\s*,\s*/, $1); return (grep $_ eq $user, @bots)?0:1; } return 1; } # Within a single MediaWiki call, we ask the API to make up to 5 attempts, 10 s apart, until # the worst-case server lag is better than 5s. my $mw = MediaWiki::API->new(); $mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php'; # Delay/retry parameters $mw->{config}->{max_lag} = 5; # Tell MediaWiki to put us off it there's a 5s+ db lag out there $mw->{config}->{max_lag_delay} = 10; # ..and to wait 10s between retries $mw->{config}->{max_lag_retries} = 4; # ..and to only make 4 retries before dropping back to our code # Our own delay parameters $standardelay = 15; $longdelay = 900; # ...if the API puts us off several times in a row, take a 15-minute break my $articles = null; # login while (1) { if ($mw->login( { lgname => "Joe's Olympic Bot", lgpassword => '[REDACTED]' } )) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } $profilesfound = 0; getsubd(); sub getsubd { while (1) { $sdirs = $mw->list ( { action => 'query', list => 'categorymembers', cmtitle => 'Category:Competitors at the 2012 Summer Olympics', cmlimit => "max" }, ); if ($articles) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } foreach (@{$sdirs}) { $sdirname = $_->{title}; print "########### $sdirname\n"; getlista($sdirname); } } sub getlista { my ($cata) = $_[0]; # skip directories cleaned by hand already, why bother? return if ($cata =~ m/Beach volleyball /); return if ($cata =~ m/Volleyball /); return if ($cata =~ m/Badminton /); return if ($cata =~ m/Divers /); return if ($cata =~ m/Wrestlers /); return if ($cata =~ m/Triathletes /); return if ($cata =~ m/Archers /); return if ($cata =~ m/Tennis /); return if ($cata =~ m/Taekwondo /); return if ($cata =~ m/Footballers /); return if ($cata =~ m/Handball /); # Get list of articles while (1) { $articles = $mw->list ( { action => 'query', list => 'categorymembers', cmtitle => $cata, cmlimit => "max" }, { hook=> \&dsa } , ); if ($articles) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } } sub dsa { my ($xyz) = $_[0]; # scan through the articles... foreach (@{$xyz}) { my $thistitle = $_->{title}; $listcount++; next if ($thistitle =~ m/^User:/); next if ($thistitle =~ m/^Category:/); while (1) { my $pagehash = $mw->get_page( { title => $thistitle } ); if ($pagehash) { last; } if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } } if (allowBots($pagehash->{'*'})) { my $ref = $mw->get_page( { title => $thistitle } ); $atext = decode_utf8 ( $ref->{'*'} ); my $timestamp = $ref->{'timestamp'}; # There are a couple articles which are not individual athletes but in these categories. Restrict to living people if (!($atext =~ m/Category:Living people/)) { print "NOTLIVING: " . encode("iso-8859-1", $thistitle) . "\n"; next; } # Blow out if we see any of the usual reliable athletic site templates next if ($atext =~ /\{iaaf name\|/i); next if ($atext =~ /\{Cycling archives/i); next if ($atext =~ /\{[Ff]ig\|/); next if ($atext =~ /\{Sports.reference\|/i); next if ($atext =~ /\{FISA\|/i); next if ($atext =~ /\{Swimming Australia name\|/i); next if ($atext =~ /\{cyclingwebsite\|/i); next if ($atext =~ /\{ATP\|/i); next if ($atext =~ /\{WTA\|/i); next if ($atext =~ /\{ITF female profile\|/i); next if ($atext =~ /\{FIFA player\|/i); $refcount = 0; $goodrefcount = 0; while ($atext =~ m/http(s?):([^ \<\|\'\"]*)/g) { $xurl = $&; $refcount ++; # print "URL: (" . $xurl . ")\n"; if (!($xurl =~ m/london2012/ig)) { $goodrefcount++; } elsif (!($xurl =~ m#2012\.com(\/)?$#)) { $goodrefcount++; } } if ($goodrefcount == 0) { if ($refcount == 0) { print "UNREF: " . encode("iso-8859-1", $thistitle) . "\n"; $ret = findolympian($thistitle); print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n"; sleep 5; die if ($profilesfound > 5); } else { print "BADREF: " . encode("iso-8859-1", $thistitle) . "\n"; $ret = findolympian($thistitle); print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n"; $striptitle = $thistitle; if ($striptitle =~ m/([^(]+) \(/) { $striptitle = $1; } $uastriptitle = $striptitle; $uastriptitle =~ tr/ăäóéí/aaoei/; if (($londontitle ne $striptitle) && ($londontitle ne $uastriptitle)) { print "GOFIX " . $striptitle . " as it doesn't match " . $londontitle . "\n"; } elsif ($atext =~ m|\<ref\>\[http:\/\/www.london2012.com(\/)?[ '"][^<]+\<\/ref\>|) { $revtext = decode_utf8( $`) . decode_utf8("{{subst:Cite London Olympics|name=OlympicBotGeneratedRef|title=") . decode_utf8($londontitle) . decode_utf8("|url=") . decode_utf8($ret) . decode_utf8("}}") . decode_utf8($'); $mw->edit( { action => 'edit', summary => "Joe's Olympic Bot: Correcting reference.", basetimestamp => $timestamp, # to avoid edit conflicts title => $thistitle, basetimestamp => $timestamp, # to avoid edit conflicts text => $revtext } ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } elsif ($atext =~ m|\<ref\>http:\/\/www.london2012.com(\/)?\<\/ref\>|) { $revtext = decode_utf8( $`) . decode_utf8("{{subst:Cite London Olympics|name=OlympicBotGeneratedRef|title=") . decode_utf8($londontitle) . decode_utf8("|url=") . decode_utf8($ret) . decode_utf8("}}") . decode_utf8($'); $mw->edit( { action => 'edit', summary => "Joe's Olympic Bot: Correcting reference.", basetimestamp => $timestamp, # to avoid edit conflicts title => $thistitle, basetimestamp => $timestamp, # to avoid edit conflicts text => $revtext } ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details}; } else { print "ACPHT\n"; } sleep 5; die if ($profilesfound > 3); } } break; } else { print "….DENIED\n"; } } } sub findolympian { my ($olympian) = $_[0]; my @o = split('\(', $olympian); $olympian = $o[0]; $olympian =~ tr/ /+/; $u = 'http://www.london2012.com/search/index.htmx?q=' . $olympian; # print "URL: " . $u . "\n"; $profilesfound++; my $ua = LWP::UserAgent->new; $ua->agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0'); $ua->default_header('Accept-Language' => "en-us,en;q=0.5"); $ua->default_header('Accept' => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); $ua->default_header('Connection' => "keep-alive"); my $response = $ua->get($u); $html = $response->content; undef $londontitle; undef $finishedurl; if (defined($html)) { if ($html =~ m#href=.(\/athlete\/[^\/]+\/)# ) { $finishedurl = "http://www.london2012.com" . $1; if ($html =~ m#"name"\>([^<]+)\<\/span\>\<span class="surname"\>([^<]+)\<\/span#) { $londontitle = decode_utf8($1). decode_utf8($2); $londontitle =~ s/\s+$//; print "LONDONTITLE (" . $londontitle . ")\n"; } } else { print "CANTFINDLINK\n" ; return $finishedurl; } } else { print "NORETURN\n" ; } return Encode::decode_utf8($finishedurl); }