# User:AnomieBOT/source/tasks/ReplaceExternalLinks4.pm
# Approved 2011-11-02: [[Wikipedia:Bots/Requests for approval/AnomieBOT 58]]
package tasks::ReplaceExternalLinks4;
=pod
=begin metadata
Bot: AnomieBOT
Task: ReplaceExternalLinks4
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 58
Status: Approved 2011-11-02
Created: 2011-10-21
OnDemand: true
Replace URL redirector links with direct links to the target URL.
=end metadata
=cut
use utf8;
use strict;
use warnings;
use Data::Dumper;
use URI;
use URI::Escape;
use AnomieBOT::Task qw/:time/;
# Inherit from the bot task base class. (`use vars` is obsolete; `our` is the
# modern lexical alias for the package global @ISA.)
our @ISA = qw(AnomieBOT::Task);
# Maps euquery values (host/path prefixes fed to the exturlusage generator)
# to replacement functions. Each function receives the matched URL and
# returns a list (see the call in run()):
#   ($new_url)                                    on success, or
#   (undef, $warn_log, $whine_subject, $whine_body) when the URL cannot
#   be rewritten automatically.
my %replacements=(
);
# Youtube shortener: rewrite youtu.be/<id> into youtube.com/watch?v=<id>
if(0){ # disable for now, all fixed/logged and currently blacklisted
$replacements{'youtu.be/'}=sub {
my $url=shift;
my $u1=URI->new($url);
# Build the target URL, keeping whatever scheme the original link used
# (http, https, or none for a protocol-relative link).
my $u2=URI->new("//youtube.com/watch");
$u2->scheme($u1->scheme);
# The first path segment is the video id; drop leading slashes and any
# trailing path components.
my $p=$u1->path;
$p=~s!^/*([^/]+)(?:/.*)?$!$1!;
# Put the id into 'v' and carry over any other query parameters.
$u2->query_form(v=>uri_unescape($p), $u1->query_form);
my $ret=$u2->as_iri;
return ($ret);
};
}
# Google has lots of patterns, construct programmatically
if(0){ # disable for now, all fixed/logged and currently blacklisted
# Google hosts observed using redirector URLs.
my @domains=qw(
www.google.com
books.google.com
books.google.co.uk
encrypted.google.com
images.google.ca
images.google.com
images.google.co.uk
images.google.ie
news.google.ca
news.google.co.in
news.google.com
news.google.com.au
news.google.com.br
news.google.com.co
news.google.com.hk
news.google.co.uk
news.google.co.za
news.google.de
news.google.ie
news.google.it
news.google.nl
news.google.ru
scholar.google.com
scholar.google.de
scholar.google.se
translate.google.com
www.google.at
www.google.az
www.google.be
www.google.bg
www.google.ca
www.google.ch
www.google.cl
www.google.cm
www.google.co.id
www.google.co.il
www.google.co.in
www.google.co.jp
www.google.co.ke
www.google.co.kr
www.google.co.ma
www.google.com.ar
www.google.com.au
www.google.com.br
www.google.com.co
www.google.com.ec
www.google.com.fj
www.google.com.gh
www.google.com.hk
www.google.com.lb
www.google.com.mx
www.google.com.my
www.google.com.ng
www.google.com.np
www.google.com.om
www.google.com.pe
www.google.com.ph
www.google.com.pk
www.google.com.pr
www.google.com.sg
www.google.com.tr
www.google.com.tw
www.google.com.ua
www.google.com.uy
www.google.co.nz
www.google.co.th
www.google.co.uk
www.google.co.za
www.google.co.zw
www.google.cz
www.google.de
www.google.dk
www.google.ee
www.google.es
www.google.fi
www.google.fr
www.google.gr
www.google.hr
www.google.hu
www.google.ie
www.google.it
www.google.jo
www.google.lk
www.google.lv
www.google.md
www.google.nl
www.google.no
www.google.pl
www.google.pt
www.google.ro
www.google.ru
www.google.se
www.google.si
www.google.sk
www.google.sm
);
# Redirector paths seen on those hosts; each combines with every domain.
my @suffixes=qw(
/url?
/archivesearch/url?
/bookmarks/url?
/history/url?
/m/url?
/newspapers/url?
/news/url?
);
# Shared replacement: the target URL lives in the 'q' or 'url' query
# parameter. 'url' wins over 'q' when both are present and look like URLs
# (it is assigned last).
my $repl=sub {
my $url=shift;
my %q=URI->new($url)->query_form;
my $ret=undef;
$ret=$q{'q'} if ($q{'q'}//'')=~/^(?:http|ftp)/;
$ret=$q{'url'} if ($q{'url'}//'')=~/^(?:http|ftp)/;
if(!defined($ret)){
# No usable target: return undef plus log/report text per the
# %replacements contract.
return ($ret, "Could not find 'url' or 'q' parameter in Google $url", "Invalid/obfuscated Google redirect", "The link <code><nowiki>$url</nowiki></code> does not contain a <code>q</code> or <code>url</code> parameter containing the target URL. Please fix manually.");
}
return ($ret)
};
# Register the same handler for every domain+suffix combination.
for my $domain (@domains) {
for my $suffix (@suffixes) {
$replacements{$domain.$suffix}=$repl;
}
}
}
###########################
# Character class matching characters allowed inside an external-link URL:
# everything except square/angle brackets, double quote, control characters
# (\x00-\x20), DEL, and Unicode space separators (mirrors MediaWiki's
# external-link syntax; see the "post-processing" handling in run()).
my $chars='[^][<>"\x00-\x20\x7F\p{Zs}]';
sub new {
    my $class = shift;
    my $self = $class->SUPER::new();

    # Runtime state for run(): remaining protocols to scan, current iterator.
    $self->{'proto'} = undef;
    $self->{'iter'}  = undef;

    # Turn each %replacements key into a regex fragment:
    #  - quotemeta, but keep '/' literal,
    #  - '*' becomes "any run of URL characters",
    #  - the hostname part (everything before the first '/') matches
    #    case-insensitively.
    my (%remap, @patterns);
    for my $key (keys %replacements) {
        my $pat = quotemeta($key);
        $pat =~ s!\\/!/!g;
        $pat =~ s/\\\*/$chars*/g;
        $pat =~ s!^(.*?)($|/)!(?i:$1)$2!;
        push @patterns, $pat;
        # Per-key regex, used to find which replacement applies to a URL.
        $remap{$key} = qr!//$pat!;
    }
    $self->{'remap'} = \%remap;

    # Combined regex matching any known redirector followed by the rest of
    # the URL.
    my $combined = '//(?:' . join('|', @patterns) . ')' . $chars . '*';
    $self->{'re'} = qr/$combined/;

    bless $self, $class;
    return $self;
}
=pod
=for info
Approved 2011-11-02.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 58]]
=cut
sub approved {
    # Negative value: approved, but runs on demand only — NOTE(review):
    # confirm the return-code convention against AnomieBOT::Task.
    my $status = -1;
    return $status;
}
# Main task entry point. Scans pages using known URL redirectors and
# rewrites those links to point directly at the target URL.
# Returns the number of seconds to sleep before the task should run again
# (0 = reschedule immediately).
sub run {
my ($self, $api)=@_;
my $res;
# Register with the task framework (argument meanings per AnomieBOT::API —
# presumably priority/period and dependency modules; confirm there).
$api->task('ReplaceExternalLinks4', 0, 10, qw/d::Templates d::Talk/);
my $screwup='Errors? [[User:'.$api->user.'/shutoff/ReplaceExternalLinks4]]';
# Spend a max of 5 minutes on this task before restarting
my $endtime=time()+300;
my $re=$self->{'re'};
my %remap=%{$self->{'remap'}};
my $fix=0;
my $page;
# Rewrite one external link if it matches a known redirector.
#   $fmt: 1 = bracketed link, 2 = bare link (needs MediaWiki-style trimming)
#   $url: URL portion; $txt: trailing link text (returned unchanged)
# Returns the (possibly rewritten) URL concatenated with $txt.
# Also increments $fix and emits warnings/reports via $api on failure.
my $checkExtLink=sub {
my ($fmt,$url,$txt)=@_;
my $prefix;
if($fmt==2){
# Duplicate Mediawiki post-processing of bare external links
$txt=$1.$txt if $url=~s/((?:[<>]|&[lg]t;).*$)//;
# Trailing punctuation is not part of a bare link; ')' counts as
# punctuation only when the URL has no '(' to balance it.
my $sep=',;\.:!?';
$sep.=')' unless $url=~/\(/;
$txt=$1.$txt if $url=~s/([$sep]+$)//;
# There shouldn't be a template inside the url
$txt=$1.$txt if $url=~s/(\{\{.*$)//;
$prefix=qr/https?:/;
} else {
# Bracketed links may be protocol-relative.
$prefix=qr/(?:https?:)?/;
}
# Not one of our redirectors: leave the link untouched.
return $url.$txt unless $url=~/^$prefix$re$/;
# Reset the each() iterator in case a previous call returned mid-loop.
keys %remap;
while(my ($k,$r)=each %remap){
next unless $url=~/^$prefix$r/;
my ($ret,$log,$errs,$errb)=$replacements{$k}($url);
if(defined($ret)){
$fix++;
# Percent-escape any characters MediaWiki would not accept in a link.
$ret=~s/([][<>"\x00-\x20\x7F\p{Zs}])/ uri_escape_utf8($1,'\x00-\xff') /ge;
return $ret.$txt;
}
# Replacement declined: log/report as instructed, then try other patterns.
$api->warn("$log in $page") if defined($log);
$api->whine("$errs in [[:$page]]", $errb, Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1) if(defined($errs) && defined($errb));
}
return $url.$txt;
};
# Apply $checkExtLink to every external link in a chunk of wikitext,
# skipping nowiki sections, templates, and XLinkBot notices.
my $fixLinks=sub {
my $txt=shift;
my $nowiki;
# Hide bits we shouldn't process
($txt,$nowiki)=$api->strip_nowiki($txt);
($txt,$nowiki)=$api->strip_templates($txt, sub { return 1; }, {}, $nowiki);
# Hide XLinkBot notices
if($page=~/^User talk:/){
($txt,$nowiki)=$api->strip_regex(qr/[^\n]*\[\[User:XLinkBot(?:\||\]\])[^\n]*/, $txt, $nowiki);
}
# First, fix any bracketed external link
$txt=~s{\[((?:https?:)?$re)( *[^\]\x00-\x08\x0a-\x1F]*?)\]}{ '['.($checkExtLink->(1,$1,$2)).']' }ge;
# Now hide the bracketed external links
($txt,$nowiki)=$api->strip_regex(qr{\[(?:https?:)?//[^][<>\x22\x00-\x20\x7F]+ *[^\]\x00-\x08\x0a-\x1F]*?\]}, $txt, $nowiki);
# Fix any bare external links
$txt=~s{\b(https?:$re)}{ $checkExtLink->(2,$1,'') }ge;
# Unstrip
$txt=$api->replace_stripped($txt,$nowiki);
return $txt;
};
# Resume state: protocols still to scan; default to both http and https.
$self->{'proto'}=['http','https'] unless @{$self->{'proto'}//[]};
while(@{$self->{'proto'}}){
if(!defined($self->{'iter'})){
# Find pages linking to any known redirector via the exturlusage generator.
$self->{'iter'}=$api->iterator(
generator => 'exturlusage',
geuprotocol => shift @{$self->{'proto'}},
geuquery => [ keys %replacements ],
geulimit => '1000', # exturlusage has issues with big lists
);
}
while(my $pg=$self->{'iter'}->next){
if(!$pg->{'_ok_'}){
$api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
# Transient failure: retry in a minute.
return 60;
}
return 0 if $api->halting;
$page=$pg->{'title'};
my $tok=$api->edittoken($page, EditRedir => 1);
if($tok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$tok->{'content'}."\n");
return 300;
}
# Pages we cannot edit get reported for manual fixing.
if($tok->{'code'} eq 'pageprotected'){
$api->whine("[[:$page]] is protected", "Please fix manually.", Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1);
next;
}
if($tok->{'code'} eq 'botexcluded'){
$api->whine("Bot excluded from [[:$page]]", "<nowiki>".$tok->{'error'}."</nowiki>. Please fix manually or adjust the exclusion.", Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1);
next;
}
if($tok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
next;
}
if(exists($tok->{'missing'})){
$api->warn("WTF? $page does not exist?\n");
next;
}
my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
$fix=0;
# First, process links in templates
my $outtxt=$api->process_templates($intxt, sub {
shift; #$name
my $params=shift;
shift; #$wikitext
shift; #$data
my $oname=shift;
# Rebuild the template, fixing links inside each parameter.
my $ret="{{$oname";
for my $p (@$params){
$ret.='|'.($fixLinks->($p));
}
$ret.="}}";
return $ret;
});
# Now clean up the rest of the page.
$outtxt=$fixLinks->($outtxt);
if($outtxt ne $intxt){
my @summary=();
push @summary, "bypassing $fix redirection URL".($fix==1?'':'s') if $fix;
unless(@summary){
# Text changed but no replacement was counted; refuse to save.
$api->warn("Changes made with no summary for $page, not editing");
next;
}
$summary[$#summary]='and '.$summary[$#summary] if @summary>1;
my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));
$api->log("$summary in $page");
my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);
# A blacklisted target URL blocks the save; report it for manual fixing.
if(lc($r->{'code'}) eq 'failure' && exists($r->{'edit'}{'spamblacklist'})){
my $bl=$r->{'edit'}{'spamblacklist'};
$api->log("Write failed on $page: Blacklisted link $bl");
$api->warn("Write failed on $page: Blacklisted link $bl\n");
$api->whine("Redirect to blacklisted URL in [[:$page]]", "MediaWiki's [[MediaWiki:Spam-blacklist|spam blacklist]] complained about <nowiki>$bl</nowiki>. Note there may be more than one blacklisted link in the page. Please fix manually.", Pagename=>'User:AnomieBOT/ReplaceExternalLinks4 problems', NoSmallPrint=>1);
next;
}
if($r->{'code'} ne 'success'){
$api->warn("Write failed on $page: ".$r->{'error'}."\n");
next;
}
}
# If we've been at it long enough, let another task have a go.
return 0 if time()>=$endtime;
}
# This protocol's iterator is exhausted; move on to the next protocol.
$self->{'iter'}=undef;
}
$api->log("May be DONE!");
# Nothing left to do; check again in an hour.
return 3600;
}
1;