User:Joe's Olympic Bot/source

Not finished, but a work in progress on the traversal code. Note the unused subroutine at bottom, which has been tested lightly for grabbing a URL from the london2012.com site's search function.

use MediaWiki::API;
use Encode;
use LWP::UserAgent;
use utf8;

   # Gently pruned from the standard exclusion code to hardcode $user and $opt
   sub allowBots {
    my($text) = @_;

    my $user = "Joe's Olympic Bot";

    return 0 if $text =~ /{{[nN]obots}}/;
    return 1 if $text =~ /{{[bB]ots}}/;
    if($text =~ /{{[bB]ots\s*\|\s*allow\s*=\s*(.*?)\s*}}/s){
        return 1 if $1 eq 'all';
        return 0 if $1 eq 'none';
        my @bots = split(/\s*,\s*/, $1);
        return (grep $_ eq $user, @bots)?1:0;
    }
    if($text =~ /{{[bB]ots\s*\|\s*deny\s*=\s*(.*?)\s*}}/s){
        return 0 if $1 eq 'all';
        return 1 if $1 eq 'none';
        my @bots = split(/\s*,\s*/, $1);
        return (grep $_ eq $user, @bots)?0:1;
    }
    return 1;
  }

 
  # Within a single MediaWiki call, we ask the API to make up to 5 attempts, 10 s apart, until
  # the worst-case server lag is better than 5s. 
  my $mw = MediaWiki::API->new();
  $mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php';

  # Delay/retry parameters
 
  $mw->{config}->{max_lag}         = 5;        # Tell MediaWiki to put us off it there's a 5s+ db lag out there
  $mw->{config}->{max_lag_delay}   = 10;  # ..and to wait 10s between retries
  $mw->{config}->{max_lag_retries} = 4;    # ..and to only make 4 retries before dropping back to our code

  # Our own delay parameters
  $standardelay      = 15;  
  $longdelay         = 900;  # ...if the API puts us off several times in a row, take a 15-minute break

  my $articles = null;

  # login
  while (1) { 
    if ($mw->login( { lgname => "Joe's Olympic Bot", lgpassword => '[REDACTED]' } )) {
      last;
    }

    if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
      sleep $longdelay;   
    } else {
      die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    } 
  } 

  $profilesfound = 0;

  getsubd();



sub getsubd {

  while (1) {
    $sdirs = $mw->list ( {
       action => 'query',
       list => 'categorymembers',
       cmtitle => 'Category:Competitors at the 2012 Summer Olympics', 
       cmlimit => "max" },

      );  

    if ($articles) { last; }

    if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
      sleep $longdelay;   
    } else {
      die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    } 
  }

  foreach (@{$sdirs}) {
    $sdirname = $_->{title};
    print "########### $sdirname\n";
    getlista($sdirname);
  }
}



sub getlista {

  my ($cata) = $_[0];

  # skip directories cleaned by hand already, why bother?

  return if ($cata =~ m/Beach volleyball /);
  return if ($cata =~ m/Volleyball /);
  return if ($cata =~ m/Badminton /);
  return if ($cata =~ m/Divers /);
  return if ($cata =~ m/Wrestlers /);
  return if ($cata =~ m/Triathletes /);
  return if ($cata =~ m/Archers /);
  return if ($cata =~ m/Tennis /);
  return if ($cata =~ m/Taekwondo /);
  return if ($cata =~ m/Footballers /);
  return if ($cata =~ m/Handball /);


  # Get list of articles
  while (1) {
    $articles = $mw->list ( {
       action => 'query',
       list => 'categorymembers',
       cmtitle => $cata, 
       cmlimit => "max" },

        {  hook=> \&dsa	} ,
      );  

    if ($articles) { last; }

    if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
      sleep $longdelay;   
    } else {
      die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    } 
  }
} 


sub dsa {

  my ($xyz) = $_[0];

  # scan through the articles...
  foreach (@{$xyz}) {

      my $thistitle = $_->{title};
      $listcount++;

      next if ($thistitle =~ m/^User:/);     
      next if ($thistitle =~ m/^Category:/);

      while (1) {
         my $pagehash = $mw->get_page( { title => $thistitle } );
         if ($pagehash) { last; }

         if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
            sleep $longdelay;   
          } else {
            die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
          } 
      }
     
      if (allowBots($pagehash->{'*'})) {       
              my $ref = $mw->get_page( { title => $thistitle } );
              $atext = decode_utf8 ( $ref->{'*'} );
              my $timestamp = $ref->{'timestamp'};

              # There are a couple articles which are not individual athletes but in these categories.  Restrict to living people
              if (!($atext =~ m/Category:Living people/)) {
                 print "NOTLIVING: " . encode("iso-8859-1", $thistitle) . "\n";
                 next;
              }

              # Blow out if we see any of the usual reliable athletic site templates
              next if ($atext =~ /\{iaaf name\|/i);
              next if ($atext =~ /\{Cycling archives/i);
              next if ($atext =~ /\{[Ff]ig\|/);  
              next if ($atext =~ /\{Sports.reference\|/i);
              next if ($atext =~ /\{FISA\|/i);
              next if ($atext =~ /\{Swimming Australia name\|/i);
              next if ($atext =~ /\{cyclingwebsite\|/i);
              next if ($atext =~ /\{ATP\|/i);
              next if ($atext =~ /\{WTA\|/i);
              next if ($atext =~ /\{ITF female profile\|/i);
              next if ($atext =~ /\{FIFA player\|/i);
 
              $refcount = 0;
              $goodrefcount = 0;

              while ($atext =~ m/http(s?):([^ \<\|\'\"]*)/g) {

                $xurl = $&;

                $refcount ++;
      #          print "URL: (" . $xurl . ")\n";    

                if (!($xurl =~ m/london2012/ig)) {
                    $goodrefcount++;
                } elsif (!($xurl =~ m#2012\.com(\/)?$#)) {

                    $goodrefcount++;

                }
              }

              if ($goodrefcount == 0) {
                 if ($refcount == 0) {
                   print "UNREF: " . encode("iso-8859-1", $thistitle) . "\n";

                    $ret = findolympian($thistitle);
                    print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n";
  
                    sleep 5;
                    die if ($profilesfound > 5);

                 } else {
                   print "BADREF: " . encode("iso-8859-1", $thistitle) . "\n";

                    $ret = findolympian($thistitle);
                    print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n";

                    $striptitle = $thistitle;

                    if ($striptitle =~ m/([^(]+) \(/) {
                       $striptitle = $1;
                    }

                    $uastriptitle = $striptitle;
                    $uastriptitle =~ tr/ăäóéí/aaoei/;

                    if (($londontitle ne $striptitle) && ($londontitle ne $uastriptitle)) {
                       print "GOFIX " . $striptitle . " as it doesn't match " . $londontitle . "\n";
                    } elsif ($atext =~ m|\<ref\>\[http:\/\/www.london2012.com(\/)?[ '"][^<]+\<\/ref\>|) {

                       $revtext = decode_utf8( $`)  

                            . decode_utf8("{{subst:Cite London Olympics|name=OlympicBotGeneratedRef|title=") 
                            . decode_utf8($londontitle)
                            . decode_utf8("|url=") 
                            . decode_utf8($ret)
                            . decode_utf8("}}")
                            . decode_utf8($');                   
                                  
                       $mw->edit( {
                        action => 'edit',
                        summary => "Joe's Olympic Bot: Correcting reference.",
                        basetimestamp => $timestamp, # to avoid edit conflicts
                        
                        title => $thistitle,
                          basetimestamp => $timestamp, # to avoid edit conflicts
                        text => $revtext   } )
                       || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};

                    } elsif ($atext =~ m|\<ref\>http:\/\/www.london2012.com(\/)?\<\/ref\>|) {

                       $revtext = decode_utf8( $`)  

                            . decode_utf8("{{subst:Cite London Olympics|name=OlympicBotGeneratedRef|title=") 
                            . decode_utf8($londontitle)
                            . decode_utf8("|url=") 
                            . decode_utf8($ret)
                            . decode_utf8("}}")
                            . decode_utf8($'); 
                                  
                       $mw->edit( {
                        action => 'edit',
                        summary => "Joe's Olympic Bot: Correcting reference.",
                        basetimestamp => $timestamp, # to avoid edit conflicts
                        
                        title => $thistitle,
                          basetimestamp => $timestamp, # to avoid edit conflicts
                        text => $revtext   } )
                       || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};

                    } else  {
                       print "ACPHT\n";
                    }
 
                    sleep 5;
                    die if ($profilesfound > 3);
                 }
              }

           break;
      } else {
        print "….DENIED\n";
      }
  }
}


sub findolympian {

  my ($olympian) = $_[0];

  my @o = split('\(', $olympian);
  $olympian = $o[0];
  $olympian =~ tr/ /+/;

  $u = 'http://www.london2012.com/search/index.htmx?q=' . $olympian;

# print "URL: " . $u . "\n";

$profilesfound++;

 my $ua = LWP::UserAgent->new;
 $ua->agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0');
 $ua->default_header('Accept-Language' => "en-us,en;q=0.5");
 $ua->default_header('Accept' => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
 $ua->default_header('Connection' => "keep-alive");

 my $response = $ua->get($u);

 $html = $response->content;
 undef $londontitle; 

 undef $finishedurl;
 if (defined($html)) {
    if ($html =~ m#href=.(\/athlete\/[^\/]+\/)#       ) {
        $finishedurl = "http://www.london2012.com" . $1;

       if ($html =~ m#"name"\>([^<]+)\<\/span\>\<span class="surname"\>([^<]+)\<\/span#) {
           $londontitle = decode_utf8($1). decode_utf8($2);

           $londontitle =~ s/\s+$//;

print "LONDONTITLE (" . $londontitle . ")\n";
        }
    } else {
       print "CANTFINDLINK\n" ;
       return $finishedurl;
    }
 } else {
   print "NORETURN\n" ;
 }

 return Encode::decode_utf8($finishedurl);
}