User:OrphanBot/tagbot.pl
Appearance
Source code for OrphanBot's upload-tagging task. Requires libBot.pm and Pearle.pm.
#!/usr/bin/perl
# Tagbot
#
# A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.
use strict;
use warnings;
use Date::Calc qw(Month_to_Text Today);
use Array::Utils;
use utf8;
use Data::Dumper;
use libBot;
binmode STDOUT, ":utf8";
my $permit_interruptions = 0; # Allow talkpage messages to stop the bot?
my ($cur_y, $cur_m, $cur_d);
my %users_notified; # List of users notifed. 0, undef = no; 1 = notified once; 2 = notified and second notice
my %notifications; # List of user,image pairs, used to ensure that no user is ever notified about an image twice.
my %dont_notify = (); # List of users to never notify
my %banned_users = (); # List of users banned from uploading
my %exempt_users = (); # List of users exempt from inspection
my %unknown_tags; # List of tags found that are not in either the "good" or "bad" list
my @sourcereq_tags; # List of tags that require a separate source
my $sourcereq_tags;
my @nosource_tags; # List of self-sourcing tags
my $nosource_tags;
my @deletion_tags; # Tags that will eventually lead to the deletion of the image
my $deletion_tags;
my @forbidden_tags; # List of tags that should never be seen
my $forbidden_tags;
my @deprecated_tags; # List of tags that shouldn't be used any more
my $deprecated_tags;
my @nontags; # List of tags that aren't copyright tags
my $nontags;
my @source_tags; # List of tags that provide source but not copyright status
my $source_tags;
sub loadTagList
{
my $filename = shift;
my @list = ();
open INFILE, "<", $filename;
while(<INFILE>)
{
$_ =~ s/#.*//; # Remove comments
$_ =~ s/^\s*//; # Remove leading whitespace
$_ =~ s/\s*$//; # Remove trailing whitespace
push @list, $_ if($_ !~ /^\s*$/);
}
close INFILE;
return @list;
}
sub processTagList
{
my $tags = join "|", @_;
$tags =~ s/\(/\\\(/g;
$tags =~ s/\)/\\\)/g;
$tags =~ s/\./\\\./g;
$tags =~ s/\*/.*?/g;
return "($tags)";
}
@sourcereq_tags = loadTagList("sourcereq.tags");
$sourcereq_tags = processTagList(@sourcereq_tags);
print "Sourcereq: Loaded\n\n";
@nosource_tags = loadTagList("nosource.tags");
$nosource_tags = processTagList(@nosource_tags);
print "Nosource: Loaded\n\n";
@forbidden_tags = loadTagList("forbidden.tags");
$forbidden_tags = processTagList(@forbidden_tags);
print "Forbid: Loaded\n\n";
@deletion_tags = loadTagList("deletion.tags");
$deletion_tags = processTagList(@deletion_tags);
print "Deletion: Loaded\n\n";
@deprecated_tags = loadTagList("deprecated.tags");
$deprecated_tags = processTagList(@deprecated_tags);
print "Deprecated: Loaded\n\n";
@nontags = loadTagList("nontags.tags");
$nontags = processTagList(@nontags);
print "Nontags: Loaded\n\n";
@source_tags = loadTagList("source.tags");
$source_tags = processTagList(@source_tags);
print "Sourcetags: Loaded\n\n";
sub tokenSubst
{
my $string = shift;
my $image = shift;
$string =~ s/<IMAGE>/$image/g if(defined($image));
$string =~ s/<DAY>/$cur_d/g;
$string =~ s/<MONTH>/$cur_m/g;
$string =~ s/<YEAR>/$cur_y/g;
return $string;
}
sub loadUserList
{
my $file = shift;
my %notelist;
my $i = 0;
Pearle::myLog(4, "File: $file\n");
open INFILE, "<", $file;
while(<INFILE>)
{
my ($user, $reason);
$_ =~ s/\s*#.*$//g;
chomp;
($user, $reason) = $_ =~ /([^\t]*)\t+(.*)/;
next if(!defined($user) or !defined($reason));
$notelist{$user} = $reason;
$i++;
}
close INFILE;
Pearle::myLog(3, "$i notifications loaded\n");
return %notelist;
}
# Initialize
($cur_y, $cur_m, $cur_d) = Today(1); # Today in GMT
$cur_m = Month_to_Text($cur_m);
Pearle::init("username", "password", "tagbot.log","cookies.tagbot.txt");
Pearle::config(nullOK => 1, sanityCheck => 1, loglevel => 3, printlevel => 4, testmode => 0);
config(username => 'username');
Pearle::myLog(2, "Beginning execution\n");
%dont_notify = loadNotificationList("orphanbot.whitelist");
%banned_users = loadUserList("banneduser.list");
%exempt_users = loadUserList("exemptuser.list");
if(!Pearle::login())
{
exit;
}
# Get the day's uploads
my @articles;
@articles = Pearle::getLogArticles(log => 'upload', limit => 150);
# Chop off the 20 most recent log entries
splice @articles, 0, 20;
Pearle::myLog(3, scalar(@articles) . " images found\n");
foreach my $log_entry (@articles)
{
my $image = $log_entry->[0];
my $uploader = $log_entry->[1];
my $summary = $log_entry->[2] || "";
print "$image\n";
print "$uploader\n";
Pearle::myLog(2, "Processing image $image\n");
# Basic checks that can be done from the log alone
# Non-terminating check: Was the image uploaded by a blacklisted user?
if($banned_users{$uploader})
{
botwarnlog("*Image [[:$image]] uploaded by blacklisted user [[User:$uploader]]\n");
Pearle::myLog(3, "Upload by banned user $uploader\n");
}
# Terminating check: Is the user on the whitelist?
if($exempt_users{$uploader})
{
Pearle::myLog(2, "Upload by exempt user $uploader found.\n");
next;
}
# Terminating check: Is the upload a modification?
if($summary =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i)
{
Pearle::myLog(2, "Optimize upload found for image $image\n");
next;
}
if($summary =~ /tweak|crop|scale|adjust|change|resize|corrected|correcting/i)
{
Pearle::myLog(2, "Tweak found for image $image\n");
next;
}
# Terminating check: Is the upload a revert?
if($summary =~ /Reverted to earlier revision|Reverted to version/)
{
Pearle::myLog(2, "Revert upload found for image $image\n");
next;
}
# Get page data
my $image_data = Pearle::APIQuery(titles => $image, prop => ['templates', 'revisions'],
tllimit => 500, # All the templates
rvprop => ['content'], # Article body
meta => 'userinfo', uiprop => ['hasmsg'], # Check for talkpage messages
redirects => 1, # Resolve redirects
);
if(!defined($image_data))
{
Pearle::myLog(1, "Server did not return an appropriate response.\n");
next;
}
my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data, ForceArray => ['tl'] );
Pearle::myLog(4, Dumper($parsed_xml));
my $page_text = GetPageText($parsed_xml);
my @templates = GetPageTemplates($parsed_xml);
# Remove non-tags from template list
# TODO: Remove redlinks
@templates = grep {$_ !~ /:$nontags$/i} @templates;
my $stripped_page_text = $page_text || "";
$stripped_page_text =~ s/^==.*?==//gm; # Remove section headers
$stripped_page_text =~ s/\n//g; # Remove newlines
$stripped_page_text =~ s/{{{[^}]+}}}//g; # Remove template parameters
$stripped_page_text =~ s/{{[^}]+}}//gi; # Remove templates
Pearle::myLog(4, "Templates: " . join(", ", @templates) . "\n");
Pearle::myLog(4, "Stripped text: $stripped_page_text\n");
print "=============================================================================\n";
# Check for interruptions
if($permit_interruptions and DoIHaveMessages($image_data))
{
Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
last;
}
# Sanity check: Does the image still exist?
if(defined($parsed_xml->{query}->{pages}->{page}->{missing}))
{
Pearle::myLog(2, "Image $image has already been deleted\n");
next;
}
# Sanity check: Is the image marked for deletion?
if(grep {$_ =~ /:$deletion_tags$/i} @templates)
{
# We don't do anything with images already marked for deletion. There are just too many corner cases and wasted-effort conditions.
Pearle::myLog(2, "Deletion tag found\n");
next;
}
# Check for red flag: "Google Image" (matches 'image', 'images', 'imagesearch')
if(defined($page_text) and $page_text =~ /google image/i)
{
Pearle::myLog(2, "Image $image has red-flag keyword 'google image'\n");
botwarnlog("* Image [[:$image]] has red-flag keyword 'google image'\n");
}
# Check for red flag: those goddamn navboxes
if(grep {$_ =~ /navbox/i} @templates)
{
botwarnlog("* Navbox found on [[:$image]]\n");
}
######### Check for source, license, and tag ###################
# Meanings: "undef" = we don't know, "0" = definitely no, "1" = probably yes
my $has_source = undef;
my $has_license = undef;
my $has_tag = undef;
# Does the image lack a description page?
if(!defined($page_text) or $page_text =~ /^\s*$/)
{
Pearle::myLog(3, "Empty IDP\n");
$has_source = 0;
$has_license = 0;
$has_tag = 0;
}
# Does the image have a source-providing template?
if(my @tags = grep {$_ =~ /:$source_tags$/i} @templates)
{
Pearle::myLog(3, "Source-providing template @tags found\n");
$has_source = 1;
# Remove the tags from the candidate set: they can't keep an image from being "untagged" or having unknown tags
@templates = Array::Utils::array_diff(@templates, @tags);
}
# Does the image have a self-sourcing tag?
if(my @tags = grep {$_ =~ /:$nosource_tags$/i} @templates)
{
Pearle::myLog(3, "Self-sourcing tag @tags found\n");
$has_source = 1;
$has_license = 1;
$has_tag = 1;
}
# Does the image have a sourcereq tag?
if(my @tags = grep {$_ =~ /:$sourcereq_tags$/i} @templates)
{
Pearle::myLog(3, "Sourcereq tag @tags found\n");
$has_license = 1;
$has_tag = 1;
}
# Handle those damned "Information" and "Non-free use rationale" tags
if(grep {$_ =~ /:Information$/} @templates)
{
Pearle::myLog(3, "Has an Information template\n");
# Remove the template from the list
@templates = grep {$_ !~ /:Information$/} @templates;
# Attempt to parse an "information" template
if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "source" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Information:source}}\n");
$has_source = 1;
}
if($page_text =~/\|\s*author\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "author" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Information:author}}\n");
$has_source = 1;
}
if($page_text =~/\|\s*permission\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "permission" parameter, assume a license (but not a tag)
Pearle::myLog(3, "Assuming license in {{Information:permission}}\n");
$has_license = 1;
}
if($page_text =~/\|\s*flickr_url\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "flickr_url" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Flickr:flickr_url}}\n");
$has_source = 1;
}
}
if((grep {$_ =~ /:Non-free media rationale$/} @templates) or
(grep {$_ =~ /:Non-free use rationale$/} @templates))
{
Pearle::myLog(3, "Has a non-free use rationale template\n");
# Remove the template from the list
@templates = grep {$_ !~ /:Non-free media rationale$/} @templates;
@templates = grep {$_ !~ /:Non-free use rationale$/} @templates;
# Attempt to parse a "non-free use rationale" template or derivative
if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "source" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Non-free * rationale:source}}\n");
$has_source = 1;
}
if($page_text =~ /\|\s*publisher\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "publisher" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Non-free * rationale:publisher}}\n");
$has_source = 1;
}
if($page_text =~ /\|\s*owner\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "owner" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Non-free * rationale:owner}}\n");
$has_source = 1;
}
if($page_text =~ /\|\s*website\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "website" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Non-free * rationale:website}}\n");
$has_source = 1;
}
if($page_text =~ /\|\s*distributor\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "distributor" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Non-free * rationale:distributor}}\n");
$has_source = 1;
}
$has_license = 1; # Assume that it's licensed as "fair use"
}
if(grep {$_ =~ /:Non-free image data$/} @templates)
{
Pearle::myLog(3, "Has a non-free image data template\n");
# Remove the template from the list
@templates = grep {$_ !~ /:Non-free image data$/} @templates;
# Attempt to parse a "non-free use rationale" template
if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i)
{
# If there's a filled-in "source" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Non-free image data:source}}\n");
$has_source = 1;
}
$has_license = 1; # Assume that it's licensed as "fair use"
}
if(grep {$_ =~ /:spoken article entry$/i} @templates)
{
Pearle::myLog(3, "Has a Spoken Article template\n");
# Remove the template from the list
@templates = grep {$_ !~ /:Spoken article entry$/i} @templates;
# Attempt to parse
if($page_text =~ /\|\s*user_name\s*=[ \t]*\S+/i)
{
# If there's a filled-in "user_name" parameter, assume a source
Pearle::myLog(3, "Assuming source in {{Spoken article entry:user_name}}\n");
$has_source = 1;
}
$has_license = 1; # Assume that it's GFDL
}
# Is the image description page lacking in tags?
# This is checked here because we may have removed "information" or "non-free use rationale" templates from the list earlier
# We want those in the list before here because it makes detecting them for parsing easier, but we don't want them in the list
# here so we can say for sure that the page is untagged.
if(scalar(@templates) == 0)
{
Pearle::myLog(3, "No templates found\n");
$has_tag = 0;
}
# Does it have source information outside of the templates?
if(length($stripped_page_text) >= 7) # Page text with headers, newlines and templates stripped is at least seven bytes ("my work")
{
# TODO: Better source checking
if(!defined($has_source) or $has_source == 0)
{
Pearle::myLog(3, "Assuming page has source\n");
$has_source = 1;
}
if(!defined($has_license) or $has_license == 0)
{
Pearle::myLog(3, "Assuming page has license\n");
$has_license = 1;
}
}
else
{
if(!defined($has_source))
{
# If we still don't know if it has a source, it's safe to assume it doesn't.
Pearle::myLog(3, "Assuming page doesn't have source\n");
$has_source = 0;
}
if(!defined($has_license))
{
# If we still don't know if it has a license, we'll assume it doesn't
Pearle::myLog(3, "Assuming page doesn't have license information\n");
$has_license = 0;
}
}
########## Check for exceptional conditions ##########
# Terminating check: Is the image using a deprecated tag, and doesn't have any other license tag?
if(($has_tag != 1) and (grep {$_ =~ /:$deprecated_tags$/i} @templates))
{
Pearle::myLog(2, "Image has deprecated tag\n");
# Mark as no-license
wikilog($image, tokenSubst("\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}"), "Obsolete or deprecated tag");
if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
{
Pearle::myLog(2, "Warning user $uploader\n");
wikilog("User talk:$uploader", "\n{{subst:User:OrphanBot/deprecated|$image}} --~~~~", "Image with obsolete or deprecated license");
}
Pearle::limit();
next;
}
# Terminating check: Does the image have a forbidden tag?
if(my @tags = grep {$_ =~ /:$forbidden_tags$/i} @templates)
{
# Doesn't matter what else is on the page, the image requires human handling
Pearle::myLog(2, "Forbidden tag $tags[0] found on image [[:$image]]\n");
botwarnlog("*Forbidden tag {{tl|$tags[0]}} found on image [[:$image]]\n");
Pearle::limit();
next;
}
# TODO: Terminating check: Malformed fair-use rationale
########## Process #####################################
# We've found an image with only unknown templates
if(!defined($has_tag))
{
# We don't know if it has any tags or not, and so we cannot deduce the license or source status
# We know an image doesn't have tags if:
# * It has no templates
# * or all templates are on the "nontags" list
# We know an image has tags if:
# * We found a tag we know about
print "Has unknown tags\n";
my @new_unknown_tags = grep {!defined($unknown_tags{$_})} @templates;
if(scalar(@new_unknown_tags) > 0)
{
foreach my $unknown_tag (@new_unknown_tags)
{
Pearle::myLog(2, "Unknown tag {{$unknown_tag}} found\n");
botwarnlog("* Unknown tag [[$unknown_tag]] found\n");
$unknown_tags{$unknown_tag} = 1;
}
Pearle::limit();
}
}
elsif($has_tag == 0)
{
if($has_source == 0)
{
if($has_license == 0)
{
# Tag as "no source" and "no license"
wikilog($image, tokenSubst("\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}"), "Image has no source or license information");
if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
{
Pearle::myLog(2, "Warning user $uploader\n");
wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nosource nolicense|<IMAGE>}} --~~~~\n", $image), "You've uploaded an image with no source or license information");
}
Pearle::myLog(2, "No source, no license\n");
Pearle::limit();
}
else
{
# No license tag, and it either has a license or we don't know if it has a license
# Tag as "no source" and "untagged"
wikilog($image, tokenSubst("\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}"), "Image has no source or license tag");
if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
{
Pearle::myLog(2, "Warning user $uploader\n");
wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nosource untagged|<IMAGE>}} --~~~~\n", $image), "You've uploaded an image with no source or license tag");
}
Pearle::myLog(2, "No source, untagged\n");
Pearle::limit();
}
}
else
{
if($has_license == 0)
{
# Tag as "no license"
wikilog($image, tokenSubst("\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}"), "Image has no license information");
if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
{
Pearle::myLog(2, "Warning user $uploader\n");
wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nolicense|<IMAGE>}} --~~~~\n", $image), "You've uploaded an image with no license information");
}
Pearle::myLog(2, "No license\n");
Pearle::limit();
}
else
{
# Tag as "untagged"
wikilog($image, tokenSubst("\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}"), "Image has no license tag");
if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
{
Pearle::myLog(2, "Warning user $uploader\n");
wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/untagged-new|<IMAGE>}} --~~~~\n", $image), "You've uploaded an image with no license tag");
}
Pearle::myLog(2, "Untagged\n");
Pearle::limit();
}
}
}
else
{
# If it has a tag, it has a license
if($has_source == 0)
{
# Tag as "no source"
wikilog($image, tokenSubst("\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}"), "Image has no source information");
if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
{
Pearle::myLog(2, "Warning user $uploader\n");
wikilog("User talk:$uploader", tokenSubst("{{subst:User:OrphanBot/nosource-new|<IMAGE>}} --~~~~\n", $image), "You've uploaded an image with no source information");
}
Pearle::myLog(2, "No source\n");
Pearle::limit();
}
else
{
# Everything's fine
Pearle::myLog(2, "Image has no problems\n");
}
}
sleep(2);
}
Pearle::myLog(2, "Finished with upload set\n");