User:Coren/csb2.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page.
Documentation for this user script can be added at User:Coren/csb2.
//<syntaxhighlight language=perl>
#! /usr/bin/perl
use LWPx::ParanoidAgent;
use HTTP::Cookies;
use URI::Escape;
use Text::Align::WagnerFischer;
# Shared HTTP state used by every request helper in this script.
# The cookie jar is backed by $HOME/lwp_cookies.dat with autosave enabled.
$cookie_jar = HTTP::Cookies->new(
    file     => "$ENV{'HOME'}/lwp_cookies.dat",
    autosave => 1,
);

# User agent with a 20-second timeout that identifies itself as the bot
# and carries the cookie jar above.
$ua = LWPx::ParanoidAgent->new(timeout => 20);
$ua->agent("CorenSearchBot/1.0 ");
$ua->cookie_jar($cookie_jar);
# Print a progress message on the currently selected output handle,
# prefixed with a tab and terminated with a newline.
#   $msg - text to display
sub Doing($) {
    my $msg = shift;
    print "\t" . $msg . "\n";
}
# Return the lines of a text that carry enough "real" words.
#   $_[0] - text, split on newlines
# A line is dropped when it contains " Category " / " Categories " or the
# substring "align".  Otherwise it is scored: +1 for every run of five or
# more lowercase letters delimited by word boundaries, -2 for every '*'
# that sits directly between two word characters.  Lines scoring below 3
# are dropped; the rest are returned in their original order.
sub significant($) {
    my ($text) = @_;
    my @kept;
    for my $line (split "\n", $text) {
        next if $line =~ m/ Categor(y|ies) /;
        next if $line =~ m/align/;

        # List-context matches counted through the "= () =" idiom.
        my $long_words = () = $line =~ m/\b[a-z]{5,}\b/g;
        my $wildcards  = () = $line =~ m/\b\*\b/g;

        my $score = $long_words - 2 * $wildcards;
        push @kept, $line if $score >= 3;
    }
    return @kept;
}
# Return every line of a text except those that contain " Category " or
# " Categories " (with the surrounding spaces).
#   $_[0] - text, split on newlines
sub complete($) {
    my ($text) = @_;
    my @kept = grep { $_ !~ m/ Categor(y|ies) / } split "\n", $text;
    return @kept;
}
# Break lines into tokens on single spaces and keep only the tokens that
# are longer than three characters, preserving their order.
#   @_ - lines to tokenize
sub tokenize(@) {
    my @tokens = map { grep { length($_) > 3 } split / /, $_ } @_;
    return @tokens;
}
# Reduce a chunk of text to newline-separated plain statements.
#   $_[0] - text to clean up
# Returns the cleaned text.  Works on a private copy, so the caller's $_
# is left untouched (the previous version assigned to the global $_).
sub statementize($) {
    my ($text) = @_;

    $text =~ s/---*/ /g;      # runs of two or more dashes become one space
    $text =~ tr/!-?/ /;       # every character from '!' through '?' becomes a space
    # NOTE(review): the range '!'..'?' includes '.', '*' and the digits, so
    # the period/asterisk rules below look unreachable after this step.
    # They are kept unchanged; confirm whether the range is intentional.

    #$text =~ s/ */ /g;       (disabled in the original)
    $text =~ s/^ *//g;        # strip leading spaces
    $text =~ s/ *$//g;        # strip trailing spaces
    $text =~ s/\*([^ .])/$1/g;
    $text =~ s/\. */.\n/g;    # break after each period
    # Two wildcard-generating loops were disabled in the original:
    #while(s/([^. \n]) *([A-Z][a-zA-Z0-9_]*)/\1 */gs) { }
    #while(s/\* *\*/* /gs) { }
    $text =~ s/\.([A-Z])/\n$1/sg;
    $text =~ s/ *\././g;
    $text =~ s/\n\n*/\n/gs;   # collapse runs of newlines
    $text =~ s/\.\n/\n/gs;

    return $text;
}
# Strip wiki markup from article source and hand the result to
# statementize().
#   $_[0] - raw wikitext
# Returns whatever statementize() returns for the stripped text.  Works on
# a private copy instead of the global $_, and escapes the literal braces
# in the template pattern so newer perls accept it.
sub normalizewikitext($) {
    my ($text) = @_;

    $text =~ tr/*#/::/;                          # list markers become colons
    $text =~ s/<ref>.*?<\/ref>/ /igs;            # drop <ref>...</ref> bodies
    $text =~ s/<.*?>/ /igs;                      # drop any remaining tags
    $text =~ s/&[^;]*;/ /gs;                     # drop character entities
    # Unwrap ''italic'' / '''bold''' runs until none are left.
    while ($text =~ s/('''*)(.*?)\1/ $2 /gs) { }
    $text =~ s/\[\[([^|\]]*)]]/ $1 /gs;          # [[target]]        -> target
    $text =~ s/\[\[.*?\|(.*?)]]/ $1 /gs;         # [[target|label]]  -> label
    $text =~ s/\[[^ ]* (.*?)]/ $1 /gs;           # [url label]       -> label
    $text =~ s/\[.*?]/ /gs;                      # any other bracketed span
    # NOTE(review): without /m the '^' only matches at the very start of the
    # text, so only a heading on the first line is rewritten - confirm intent.
    $text =~ s/^(===*)(.*?)\1/$2. /g;
    $text =~ s/\{\{.*?\}\}/ /gs;                 # drop {{templates}}

    return statementize($text);
}
# Strip HTML tags and character entities from a web page and hand the
# result to statementize().
#   $_[0] - raw page content
# Returns whatever statementize() returns for the stripped text.  Works on
# a private copy so the caller's $_ is not overwritten.
sub normalizewebtext($) {
    my ($text) = @_;
    $text =~ s/<.*?>/ /igs;     # tags
    $text =~ s/\&.*?;/ /gs;     # entities (from '&' to the next ';')
    return statementize($text);
}
# POST a form-encoded query to the English Wikipedia api.php endpoint.
#   @_ - already-encoded "key=value" pairs, joined with '&' to form the body
# Returns the response content on success, undef otherwise.
sub WPRequest(@) {
    my $body = join '&', @_;

    my $req = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/api.php');
    $req->content_type('application/x-www-form-urlencoded');
    $req->content($body);

    my $res = $ua->request($req);
    return undef unless $res->is_success;
    return $res->content;
}
# Submit the login form so that the shared cookie jar holds a session.
#   $uname - account name
#   $pwd   - account password
# Always returns "Ok"; the outcome of the request is not inspected.
sub WPLogin($$) {
    my ($uname, $pwd) = @_;

    # Both fields are URL-escaped (the user name previously was not).
    my $form = 'wpName=' . uri_escape($uname)
             . '&wpPassword=' . uri_escape($pwd)
             . '&wpRemember=1&wpLoginattempt=Log+in';

    my $req = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/index.php?title=Special:Userlogin&action=submitlogin&type=login');
    $req->content_type('application/x-www-form-urlencoded');
    $req->content($form);
    my $res = $ua->request($req);

    # Cookies live on the response; the request object that was passed here
    # before carries no Set-Cookie headers.
    $cookie_jar->extract_cookies($res);
    return "Ok";
}
# Fetch the edit form of a page and pull out what is needed to save it.
#   $_[0] - page title (unescaped)
# Returns (escaped title, edit token, extra form fields, current text) on
# success, where the extra fields are the "&wpStarttime=..." and
# "&wpEdittime=..." pairs found in the form.  Returns undef when the request
# fails.  Any element is undef when the form does not contain it.
sub WPStartEdit($) {
    my ($title) = @_;
    $title = uri_escape($title);

    my $req = HTTP::Request->new(GET => "http://en.wikipedia.org/w/index.php?title=$title&action=edit");
    my $res = $ua->request($req);
    return undef unless $res->is_success;
    my $html = $res->content;

    my $txt;
    $txt = $1 if $html =~ m/<textarea[^>]*>(.*)<\/textarea>/s;
    if (defined $txt) {
        # The textarea content is HTML-escaped.  The original substitutions
        # had degenerated into no-ops (e.g. replacing '<' with '<'), so the
        # four entities are decoded here in a single pass.
        my %char_for = (lt => '<', gt => '>', quot => '"', amp => '&');
        $txt =~ s/&(lt|gt|quot|amp);/$char_for{$1}/g;
    }

    my $et;
    $et = $1 if $html =~ m/<input type='hidden' value="([^"]*?)" name="wpEditToken" \/>/s;

    my $more;
    $more .= '&wpStarttime='.uri_escape($1) if $html =~ m/<input type='hidden' value="([^"]*?)" name="wpStarttime" \/>/s;
    $more .= '&wpEdittime='.uri_escape($1) if $html =~ m/<input type='hidden' value="([^"]*?)" name="wpEdittime" \/>/s;

    return ($title, $et, $more, $txt);
}
# Submit an edit prepared with WPStartEdit().
#   $title - page title, already URL-escaped (it goes into the URL as is)
#   $et    - edit token
#   $more  - extra pre-encoded form fields, each starting with '&'
#   $txt   - new page text
#   $es    - edit summary
# Returns undef when the response contains a <textarea (the edit form came
# back), 1 otherwise.
# NOTE(review): the HTTP status is not checked, so a failed request that
# returns no edit form also counts as success - confirm this is acceptable.
sub WPTryEdit($$$$$) {
    my($title, $et, $more, $txt, $es) = @_;

    my $form = join '&',
        'wpSection=',
        'wpSummary='   . uri_escape($es),
        'wpSave=wpSave',
        'wpEditToken=' . uri_escape($et),
        'wpTextbox1='  . uri_escape($txt);

    my $req = HTTP::Request->new(POST => "http://en.wikipedia.org/w/index.php?title=$title&action=submit");
    $req->content_type('application/x-www-form-urlencoded');
    $req->content($form . $more);
    my $res = $ua->request($req);

    # The original re-read the edit token from the response into its local
    # $et, which was never used afterwards; that dead store is gone.
    return undef if $res->content =~ m/<textarea/;
    return 1;
}
# Ask the API for one revision of a page with its content.
#   $_[0] - page title
# Returns the text between <rev> and </rev> when the reply contains such an
# element; otherwise returns the reply unchanged (undef if the request
# itself failed).
sub WPArticle($) {
    my ($title) = @_;
    my @params = (
        'action=query',
        'prop=revisions',
        'titles='.uri_escape($title),
        'rvprop=content',
        'rvlimit=1',
        'format=xml',
    );
    my $reply = WPRequest(@params);
    if ($reply =~ m/<rev>(.*?)<\/rev>/s) {
        return $1;
    }
    return $reply;
}
# List titles from recent changes (namespace 0, up to 500 entries) whose
# entry has type="1" and a revid above the global $last_revid.
# Scanning stops at the first matching entry whose revid is not above
# $last_revid (presumably the API lists newest first - TODO confirm).
# Raises $last_revid to the highest revid collected and returns the titles
# in the order they were found.
sub WPNewPages() {
    my $list = WPRequest('action=query',
        'list=recentchanges',
        'rclimit=500',
        'rcnamespace=0',
        'format=xml');
    my @news;
    my $maxrid = 0;
    while ($list =~ m/<rc type="1" .*? title="([^"]*)" .*? revid="([0-9]+)"/g) {
        my ($page, $revid) = ($1, $2);
        last if $revid <= $last_revid;
        $maxrid = $revid if $revid > $maxrid;
        push @news, $page;
    }
    $last_revid = $maxrid if $maxrid > $last_revid;
    return @news;
}
# Look up the user of the first revision of a page (rvdir=newer with
# rvlimit=1 asks the API for the oldest revision).
#   $_[0] - page title
# Returns the user name, or undef when the reply has no <rev user="..." />.
sub WPCreator($) {
    my ($title) = @_;
    my @params = (
        'action=query',
        'prop=revisions',
        'titles='.uri_escape($title),
        'rvprop=user',
        'rvlimit=1',
        'rvdir=newer',
        'format=xml',
    );
    my $reply = WPRequest(@params);
    if ($reply =~ m/<rev user="([^"]*?)" \/>/s) {
        return $1;
    }
    return undef;
}
# Run a web search through the Yahoo search API (5 results, English).
#   $_[0] - query string
# Returns the result URLs found in the reply, each with one trailing slash
# removed.  <Cache> elements are dropped first so their URLs are not
# counted.  Logs the query and the number of results.
sub YahooFind($) {
    my $query = join(' ', @_);
    my $req = HTTP::Request->new(GET => 'http://search.yahooapis.com/WebSearchService/V1/webSearch?appid=SANITIZED&query='.uri_escape($query).'&results=5&language=en');
    my $res = $ua->request($req);
    my $r = $res->content;
    $r =~ s/<Cache>.*?<\/Cache>//sg;
    my @re = $r =~ m/<Url>([^<]*?)\/?<\/Url>/gs;
    # "$#re+1" inside a string printed e.g. "4+1"; report the real count.
    Doing("Search \"$query\" found ".scalar(@re)." results");
    return @re;
}
# Search for a query and file the first three result URLs as candidates.
#   $_[0] - query string
# Works on the caller's dynamically scoped @web and @enwiki lists and on
# the global @exclude list of site patterns:
#   - URLs containing ".pdf" (any case) and URLs already in @web are skipped;
#   - en.wikipedia.org /wiki/ URLs are turned into page titles and added to
#     @enwiki (once each);
#   - URLs whose host matches an @exclude pattern are skipped;
#   - everything else is appended to @web.
# Returns as soon as @web holds more than six entries.
sub top3($) {
    my($q) = @_;
    # The original read "my @uri, YahooFind($q);", which threw the search
    # results away and left @uri empty.
    my @uri = YahooFind($q);
    $#uri = 2 if $#uri > 2;
    SITE:
    foreach my $uri (@uri) {
        next if $uri =~ m/\.[pP][Dd][Ff]/;
        foreach my $seen (@web) {
            next SITE if $seen eq $uri;
        }
        # Host part of the URL.  The trailing slash is optional because
        # YahooFind strips it, which used to leave bare-domain URLs without
        # a host and therefore outside the exclusion check.
        my $site;
        $site = $1 if $uri =~ m{^[^:]*://([^/]*)};
        if($site eq 'en.wikipedia.org' and $uri =~ m{/wiki/}) {
            $uri =~ s{.*/wiki/(.*)}{$1};
            $uri = uri_unescape($uri);
            $uri =~ tr/_/ /;
            foreach my $known (@enwiki) {
                next SITE if $known eq $uri;
            }
            push @enwiki, $uri;
            next SITE;
        }
        foreach my $re (@exclude) {
            next SITE if $site =~ $re;
        }
        push @web, $uri;
        return if $#web > 5;
    }
}
# Look for web pages or other wiki articles that a given article may have
# been copied from.
#   $_[0] - article title
# Returns
#   undef                 when the article yields fewer than six tokens or
#                         no significant lines;
#   ($why, $what, $score) when a candidate scores below $config{MinScore};
#                         $why is 'pageincluded', 'pageincludes' or
#                         'wikipage', $what the URL or title, $score the
#                         best score divided by 10;
#   ('', '', 1000)        when nothing scores low enough.
# Lower scores mean closer matches.  Candidates are gathered by top3() into
# the @web and @enwiki lists localized here.
sub findmatches($) {
    my $article = WPArticle($_[0]);
    my @atokens = tokenize(complete(normalizewikitext($article)));
    #print "article <", join(' ', @atokens), ">\n";
    my @paras = significant(normalizewikitext($article));
    my $why = undef;
    my $score = $config{MinScore};
    my $what = undef;
    my $what_ok;                    # best candidate seen, matching or not
    my $score_ok = 50000;
    local @web;
    local @enwiki;
    return undef if $#atokens < 5;
    $#atokens = 200 if $#atokens > 200;     # compare at most 201 tokens
    my $ln = 0;
    my $title = $_[0];
    $title =~ s/\(.*?\) *//;                # drop a parenthesised qualifier

    # Build search queries from three of the significant lines: the quoted
    # title plus the words after the line's first space, stopping once ten
    # non-'*' words have been seen.
    foreach my $l (@paras) {
        if($ln==1 or $ln==7 or $ln==($#paras-1)) {
            if($l =~ m/ (.*)\.?/) {
                my @tq = split ' ', $1;
                my @q;
                my $num = 0;
                foreach my $w (@tq) {
                    push @q, $w if $w =~ m/[a-zA-Z0-9*]/;
                    $num++ if $w ne '*';
                    last if $num > 9;
                }
                my $q = join ' ', @q;
                top3("\"$title\" $q");
            }
        }
        $ln++;
    }
    return undef if $#paras < 0;
    top3("\"$title\"");

    # Compare the article against each candidate web page.
    foreach my $uri (@web) {
        Doing("checking $uri");
        my @src = eval {
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm 25;
            my $req = HTTP::Request->new(GET => $uri);
            my $res = $ua->request($req);
            # The alarm used to be cancelled before the request was sent,
            # so it never covered the fetch it was meant to bound.
            alarm 0;
            if($res->is_success) {
                my @src = tokenize(complete(normalizewebtext($res->content)));
                #print "webpage <", join(' ', @src), ">\n";
                return @src if $#src > 9;
            }
            return undef;
        };
        my $timed_out = ($@ eq "alarm\n");
        alarm 0;    # make sure no alarm stays pending if the eval died early
        next if $#src < 10;
        next if $timed_out;
        # Keep the product of the two token counts near 100000.
        $#src = 100000/$#atokens if $#src*$#atokens > 100000;
        my $alignment = Text::Align::WagnerFischer->new(
            left => \@src,
            right => \@atokens,
            weights => [0,1,2]
        );
        my $maybe = 'pageincluded';
        my $dif = abs ($#src-$#atokens);
        # Alignment cost, less the length difference, scaled by each length.
        $sina = ($alignment->cost()-$dif)*1000/$#src;
        $ains = ($alignment->cost()-$dif)*1000/$#atokens;
        Doing("$#src/$#atokens $dif gives cost ".($alignment->cost()-$dif)." for $sina/$ains");
        if($ains > $sina) {
            $maybe = 'pageincludes';
            $sina = $ains;
        }
        # Short articles have to meet a proportionally lower threshold.
        my $need = $config{MinScore};
        $need = ($need*$#atokens)/30 if $#atokens<30;
        if($sina < $need and $sina < $score) {
            $why = $maybe;
            $score = $sina;
            $what = $uri;
        }
        if($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok = $uri;
        }
    }

    # Compare against the other wiki articles the searches turned up.
    foreach my $uri (@enwiki) {
        next if $uri eq $_[0];
        my $test = WPArticle($uri);
        my @src = tokenize(complete(normalizewikitext($test)));
        next if $#src < 10;
        my $alignment = Text::Align::WagnerFischer->new(
            left => \@src,
            right => \@atokens,
            weights => [-1,1,2]
        );
        $sina = $alignment->cost()*1000/$#src;
        $ains = $alignment->cost()*1000/$#atokens;
        $sina = $ains if $ains < $sina;
        if($sina<-400 and $sina < $score) {
            $why = 'wikipage';
            $what = $uri;
            $score = $sina;
        }
        if($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok = $uri;
        }
    }
    return ($why, $what, ($score)/10) if $score < $config{MinScore};
    Doing("Best match was $what_ok with $score_ok");
    return ('', '', 1000);
}
# Tag an article as a suspected copy, notify its creator and add it to the
# report page.
#   $title - article title
#   $type  - match kind; selects the {{csb-<type>}} tag and notice template
#   $what  - URL or page title of the suspected source
# Returns a short reason string when nothing was tagged (creator listed in
# @allies, page is a redirect, already attributed or tagged, and so on) and
# undef once the tag has been placed.  Every edit is retried until
# WPTryEdit() reports success.
sub TagPage($$$) {
    my($title, $type, $what) = @_;
    my $tag = "{{csb-$type|1=$what}}";
    my $user = WPCreator($title);
    foreach my $ally (@allies) {
        return "creator trusted" if $user eq $ally;
    }
    $user = "User talk:$user" if defined $user;
    while(1) {
        my($ttl, $token, $more, $text) = WPStartEdit($title);
        # Literal braces are escaped so newer perls accept the patterns.
        return "article is (now) a redirect" if $text =~ m/^#REDIRECT/;
        return "attributed" if $text =~ m/\{\{DANFS\}\}/i;
        return "attributed" if $text =~ m/\{\{[cC]atholic\}\}/i;
        return "speedied" if $text =~ m/\{\{db/;
        return "marked copyvio" if $text =~ m/\{\{copyvio/;
        return "already tagged" if $text =~ m/\{\{csb-/;
        return "page gone" if length($text)<20;
        $text = "$tag\n\n" . $text;
        if(WPTryEdit($ttl, $token, $more, $text, "Tagging for copyvio of $what"))
        {
            # Notify the creator on their talk page, if one is known.
            # The posted text used to carry a frozen "22:41, 18 August 2007
            # (UTC)" stamp - a signature the wiki had already expanded in
            # this source.  Five tildes make the wiki stamp the save time.
            while(defined $user) {
                ($ttl, $token, $more, $text) = WPStartEdit($user);
                $text .= "\n{{subst:csb-notice-$type|$title|url=$what}} — [[User:Coren|Coren]] <sup>[[User Talk:Coren|(talk)]]</sup> ~~~~~\n";
                last if WPTryEdit($ttl, $token, $more, $text, "Notifying user of copyvio on $title");
            }
            # Add the article to the report page unless it is listed already.
            while(1) {
                ($ttl, $token, $more, $text) = WPStartEdit($config{ReportTo});
                # \Q...\E: titles may contain regex metacharacters, e.g.
                # parentheses, which used to make this check miss.
                my $re = qr/\[\[\Q$title\E]]/s;
                last if $text =~ $re;
                if($type eq 'wikipage') {
                    $text .= "* [[$title]] — [[$what]]. Reported by [[User:CorenSearchBot|CSBot]] at ~~~~~\n";
                } else {
                    $text .= "* [[$title]] — [$what $what]. Reported by [[User:CorenSearchBot|CSBot]] at ~~~~~\n";
                }
                last if WPTryEdit($ttl, $token, $more, $text, "Adding violation on $title");
            }
            return undef;
        }
    }
}
# Reload the bot's settings from its three wiki configuration pages into
# the globals %config, @exclude and @allies, discarding previous values.
#   User:CorenSearchBot/config  - "Name=value" lines fill %config
#   User:CorenSearchBot/exclude - lines ending in a dotted 2-4 letter suffix
#                                 become case-insensitive patterns anchored
#                                 at the end
#   User:CorenSearchBot/allies  - the part of each line after its last '='
#                                 (the whole line if there is none) is added
#                                 to @allies
sub configstatus() {
    %config  = ();
    @exclude = ();
    @allies  = ();

    for my $line (split "\n", WPArticle("User:CorenSearchBot/config")) {
        next unless $line =~ m/ *([A-Za-z]+)=(.*)/;
        $config{$1} = $2;
    }
    for my $line (split "\n", WPArticle("User:CorenSearchBot/exclude")) {
        next unless $line =~ m/ *([^=]*\.[a-z]{2,4})$/;
        push @exclude, qr/$1$/i;
    }
    for my $line (split "\n", WPArticle("User:CorenSearchBot/allies")) {
        next unless $line =~ m/ *([^=]*)$/;
        push @allies, $1;
    }
}
# ---- Main program ---------------------------------------------------------
# Log in, load the configuration, then loop forever:
#   * refill the queue from recent changes when it holds at most one page;
#   * when the queue is empty, serve manual requests listed on
#     User:CorenSearchBot/manual, or sleep 20 seconds and reload the config;
#   * check the next queued page and tag it when a match is found.
# Titles given on the command line are queued first.
#
# The result templates used to carry a frozen "22:41, 18 August 2007 (UTC)"
# stamp - a signature the wiki had already expanded in this source.  Five
# tildes make the wiki stamp the time of each save instead.
my @npq;
my $ok = WPLogin('CorenSearchBot', SANITIZED);
configstatus();
print "Configuration read.\n";
print "(", scalar(@exclude), " exclusions)\n";
print "(", scalar(@allies), " allies)\n";
print "Report to '$config{ReportTo}'\n";
print "Is a copy below $config{MinScore}\n";
print "\n";
push @npq, @ARGV;
my @manuals;
while(1) {
    if($#npq < 1) {
        print "Fetching new pages\n";
        push @npq, WPNewPages();
        print scalar(@npq), " page(s) to check. (last revid $last_revid)\n";
        if($#npq<0) {
            if($#manuals<0) {
                # Collect pending manual requests, then clear that section
                # of the request page (retried until the edit goes through).
                foreach my $line (split "\n", WPArticle("User:CorenSearchBot/manual")) {
                    push @manuals, $1 if $line =~ m/\[\[([^]]*)]]$/;
                }
                while($#manuals >= 0) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/manual");
                    $text =~ s/==Unprocessed requests==.*==Recent Results==/==Unprocessed requests==\n\n==Recent Results==/s;
                    last if WPTryEdit($ttl, $token, $more, $text, "Removing pending requests");
                }
            }
            if($#manuals>=0) {
                my $page = pop @manuals;
                my $result = "{{User:CorenSearchBot/result-no|$page|~~~~~}}\n";
                print "Manually checking [[$page]]\n";
                my($why, $what, $score) = findmatches($page);
                # Convert the match score into a confidence figure.
                $score = int(100-$score);
                $result = "{{User:CorenSearchBot/result-unknown|$page|~~~~~}}\n" if $score>-10;
                if(defined $why and not $why eq '') {
                    print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
                    $result = "{{User:CorenSearchBot/result-yes|$page|$score|~~~~~|url=$what}}\n";
                }
                while(1) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/results");
                    $text .= $result;
                    last if WPTryEdit($ttl, $token, $more, $text, "Posting result of manual check");
                }
            } else {
                print "Sleeping.\n";
                sleep 20;
                configstatus();
            }
        }
    }
    if($#npq >= 0) {
        my $page = shift @npq;
        print "Checking [[$page]]\n";
        my($why, $what, $score) = findmatches($page);
        if(defined $why and not $why eq '') {
            $score = int(100-$score);
            print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
            my $res = TagPage($page, $why, $what);
            if(defined $res) {
                print "\tTagging: $res\n";
            } else {
                print "\tTags placed\n";
            }
        }
    }
}
//</syntaxhighlight>