User:Curtis Autery/findNumChange.pl
Appearance
This is a Perl script written in response to Dark Shikari's request for a bot that could work with an offline database dump and detect revisions in which only numbers were changed.
This script does not fulfil all the requirements of the request, but it is lightweight and can be used in conjunction with other tools and troubleshooting methods.
#!/usr/bin/perl -w
# findNumChange.pl
# Version 1.0
# Searches enwiki-*-pages-meta-history.xml for revisions that
# change only numeric values. This is a common vandalism technique
# that sometimes goes unnoticed. For an example of this type of
# vandalism, see the article on Abraham Lincoln
#
# Usage:
#   findNumChange.pl inputfile > outputfile
# Also works with piped input, useful for users without
# enough free space to work on the uncompressed file,
# or without file systems that support >2gig files. Example:
#   7z e -so inputfile.7z | findNumChange.pl > outputfile
#
# Edit value of $lowstamp in format yyyy-MM-ddThh:mm:ssZ
# to narrow search to changes that happened after that date

use strict;
use warnings;

my $lowstamp = '2006-09-01T00:00:00Z';

my $title     = '';    # Article title
my $revision  = 0;     # Article revision
my $timestamp = '';    # Timestamp of revision
my $author    = '';    # Author username or IP/Domain name
my $comment   = '';    # Revision comment
my $orig      = '';    # Text of prior version of page
my $current   = '';    # Text of current revision being looked at
my $lastline  = '';    # Last line of input, used as poor man's XML parsing tool
my $line      = '';    # Current line
my $intext    = 0;     # 0 = line is XML envelope, 1 = line is article text
my $changes   = '';    # Poor man's diff of matching changes

# Clears the variables that change with each revision. The text of the
# revision just finished becomes the baseline ($orig) for the next one.
sub clearmost {
    $revision  = 0;
    $timestamp = '';
    $author    = '';
    $comment   = '';
    $orig      = $current;
    $current   = '';
    $intext    = 0;
    $changes   = '';
    return;
}

# Returns a copy of $text with every run of digits (plus one optional
# trailing comma or period) removed, except runs directly followed by "px".
sub strip_numbers {
    my ($text) = @_;

    # The atomic group (?>\d+) stops the regex engine from giving back
    # digits to satisfy the lookahead. With a plain \d+ the string "100px"
    # was reduced to "0px", so pixel-size edits were not really excluded.
    $text =~ s/(?>\d+)[,.]?(?!px)//g;
    return $text;
}

# Compares two revision texts line by line.
# Parameters: the old text and the new text, each as a single string.
# Returns a "- old\n+ new\n" entry for every line pair that differs only in
# its numbers, concatenated in line order, or '' when there is no such line
# or when the two texts do not have the same number of lines.
sub find_numeric_changes {
    my ($old_text, $new_text) = @_;

    my @old = split /\n/, $old_text;
    my @new = split /\n/, $new_text;

    # Rule out revisions with added or deleted lines
    return '' if @old != @new;

    my $diff = '';
    for my $i (0 .. $#new) {
        next if $old[$i] eq $new[$i];

        # If the lines match once numbers are removed, but the original
        # lines do not, then only the numbers have been changed.
        if (strip_numbers($old[$i]) eq strip_numbers($new[$i])) {
            $diff .= "- $old[$i]\n+ $new[$i]\n";
        }
    }
    return $diff;
}

# Main routine to apply pattern matching to the revision just read.
# Fills $changes and returns 1 when the revision should be reported,
# otherwise returns 0.
sub matchcrit {
    return 0 if $timestamp lt $lowstamp;

    # Comment these out to not trust comments indicating no vandalism
    # or to see if changes have been reverted
    return 0 if $comment =~ /correct/i;
    return 0 if $comment =~ /revert/i;
    return 0 if $comment =~ /fix/i;
    return 0 if $comment =~ /rvv/;

    $changes = find_numeric_changes($orig, $current);
    return $changes ne '' ? 1 : 0;
}

# Output summary of change
sub spitit {
    print <<EOF;
------------------------------------------------------------
Article: $title
Timestamp: $timestamp
Revision: $revision
Contributor: $author
Comment: $comment
Changes:
$changes
EOF
    return;
}

# Reads the dump line by line from the files named on the command line or
# from standard input, and reports each revision accepted by matchcrit().
sub main {
    while (<>) {
        $lastline = $line unless $intext;
        $line     = $_;

        if ($line =~ m!</revision>!) {
            spitit() if matchcrit();
            clearmost();
        }

        if ($intext) {
            $current .= $line;
            $intext = 0 if $line =~ m!</text>!;
            next;
        }

        if ($line =~ m!<title>([^<]+)</title>!) {
            $title = $1;

            # A new article starts here. Drop the previous article's text so
            # that this article's first revision is not diffed against it.
            $orig = '';
        }
        if ($line =~ m!<id>([^<]+)</id>!) {
            my $id = $1;

            # Only the <id> that directly follows the revision tag is the
            # revision number; other <id> elements are ignored.
            $revision = $id if $lastline =~ /revision/;
        }
        $author    = $1 if $line =~ m!<ip>([^<]+)</ip>!;
        $author    = $1 if $line =~ m!<username>([^<]+)</username>!;
        $timestamp = $1 if $line =~ m!<timestamp>([^<]+)</timestamp>!;
        $comment   = $1 if $line =~ m!<comment>([^<]+)</comment>!;

        if ($line =~ /<text/) {
            $current = $line;

            # Enter text mode only when the element continues on later
            # lines. A one-line or self-closing <text> is already complete;
            # otherwise any lines between it and </revision> would be
            # appended to the article text and their metadata missed.
            $intext = 1
                unless $line =~ m!</text>! || $line =~ m!<text[^>]*/>!;
        }
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded
# (e.g. by a test) without consuming input.
main() unless caller;