Jump to content

User:Curtis Autery/findNumChange.pl

From Wikipedia, the free encyclopedia

This is a perl script in response to Dark Shikari's request for a bot that could work with an offline database dump and detect revisions where only numbers were changed.

This script does not fulfil all the requirements of the request, but is lightweight and can be used in conjunction with other tools and troubleshooting methods.

#!/usr/bin/perl -w

# findNumChange.pl
# Version 1.0
# Searches enwiki-*-pages-meta-history.xml for revisions that
# change only numeric values.  This is a common vandalism technique
# that sometimes goes unnoticed.  For an example of this type of
# vandalism, see the article on Abraham Lincoln

# Usage:
# findNumChange.pl inputfile > outputfile
# Also works with piped input, useful for users without
# enough free space to work on the uncompressed file,
# or without file systems that support >2gig files.  Example:
# 7z e -so inputfile.7z | findNumChange.pl > outputfile

# Edit value of $lowstamp in format yyyy-MM-ddThh:mm:ssZ
# to narrow search to changes that happened after that date

use strict;

my $lowstamp = '2006-09-01T00:00:00Z';	# Revisions stamped before this are skipped

# String-valued scanner state; everything starts out empty.
my (
	$title,		# Title of the article being read
	$timestamp,	# Timestamp of the revision being read
	$author,	# Contributor: username, or IP/domain name
	$comment,	# Edit comment attached to the revision
	$orig,		# Text of the previous revision of the page
	$current,	# Text of the revision being examined
	$lastline,	# Previous line of input, used as poor man's XML parsing tool
	$line,		# Line being processed right now
	$changes,	# Poor man's diff of the lines that matched
) = ('') x 9;

# Numeric scanner state.
my (
	$revision,	# Revision id of the revision being read
	$intext,	# 0 = line is XML envelope, 1 = line is article text
) = (0, 0);

# Reset the per-revision state once a revision has been fully handled.
# The text just examined becomes the baseline ($orig) for the next
# revision; $title is left alone because it only changes per article.
sub clearmost {
	# Hand the text over before wiping $current, the order matters here.
	$orig = $current;
	($timestamp, $author, $comment, $current) = ('') x 4;
	$revision = $intext = 0;	# back to "reading XML envelope" mode
	$changes = '';
}

# Main routine to apply pattern matching.
#
# Compares the previous revision text ($orig) with the revision being
# examined ($current) and decides whether the edit changed nothing but
# numbers.
#
# Reads:   $timestamp, $lowstamp, $comment, $orig, $current
# Writes:  $changes - a "- old / + new" pair for every line that matched
# Returns: 1 if at least one line differs only in its numbers, 0 otherwise
sub matchcrit {
	# String comparison is enough here: both stamps use the same
	# fixed-width format, so they sort chronologically.
	return 0 if $timestamp lt $lowstamp;

	# Comment these out to not trust comments indicating no vandalism
	# or to see if changes have been reverted
	return 0 if $comment =~ /correct/i;
	return 0 if $comment =~ /revert/i;
	return 0 if $comment =~ /fix/i;
	return 0 if $comment =~ /rvv/;

	my @old = split /\n/, $orig;
	my @new = split /\n/, $current;
	my $flag = 0;
	$changes = '';
	return 0 if $#old != $#new;	# Rule out revisions with added or deleted lines

	# Loop through each line, compare original lines and lines with numbers removed.
	# If lines without numbers match, but original lines do not, then a change to
	# only the numbers has been made.
	for my $i (0 .. $#new) {
		my ($tmpold, $tmpnew) = ($old[$i], $new[$i]);

		# Delete each run of digits plus an optional trailing comma or
		# period, unless the number is followed by "px" (image sizes).
		# The digits sit in an atomic group (?>...) so the engine cannot
		# give back trailing digits to dodge the (?!px) check; without it
		# "100px" was reduced to "0px", and a resize from 100px to 200px
		# looked like a numbers-only edit.
		for my $text ($tmpold, $tmpnew) {
			$text =~ s/(?>\d+)[,.]?(?!px)//g;
		}

		if (($tmpold eq $tmpnew) && ($old[$i] ne $new[$i])) {
			$flag = 1;
			$changes .= "- $old[$i]\n+ $new[$i]\n";
		}
	}
	return $flag;
}

# Output summary of change: a separator rule, the revision's metadata
# and the collected diff lines, written to the currently selected handle.
sub spitit {
	my @report = (
		'------------------------------------------------------------',
		'',
		"Article: $title",
		"Timestamp: $timestamp",
		"Revision: $revision",
		"Contributor: $author",
		"Comment: $comment",
		'',
		'Changes:',
		$changes,
		'',		# makes the record end with a newline after the diff
	);
	print join("\n", @report);
}

# Main loop: read the dump line by line (files named on the command line,
# or standard input), track which article/revision the line belongs to,
# collect the revision text, and report a revision when it closes if
# matchcrit() says only numbers changed.
while (<>) {
	# Only envelope lines are remembered as "previous line"; they are used
	# below to tell a revision <id> apart from other <id> elements.
	$lastline = $line unless $intext;
	$line = $_;

	# End of a revision: evaluate it, then roll $current over into $orig.
	if ($line =~ m!</revision>!) {
		spitit() if matchcrit();
		clearmost();
	}

	if ($intext) {
		$current .= $line;
		$intext = 0 if $line =~ m!</text>!;
	} else {
		# A new page starts a new article: drop the text carried over from
		# the previous article, so its last revision is never diffed
		# against this article's first revision.
		$orig = '' if $line =~ m!<page>!;

		$title = $1 if $line =~ m!<title>([^<]+)</title>!;
		if ($line =~ m!<id>([^<]+)</id>!) {
			my $tmp = $1;
			# Only the <id> directly following the <revision> line is the
			# revision id.
			$revision = $tmp if $lastline =~ /revision/;
		}
		$author = $1 if $line =~ m!<ip>([^<]+)</ip>!;
		$author = $1 if $line =~ m!<username>([^<]+)</username>!;
		$timestamp = $1 if $line =~ m!<timestamp>([^<]+)</timestamp>!;
		$comment = $1 if $line =~ m!<comment>([^<]+)</comment>!;
		if ($line =~ /<text\b/) {
			$current = $line;
			# Stay in text mode only when the element continues on later
			# lines.  A one-line element (<text ...>...</text>) or an empty
			# self-closing one (<text ... />) ends right here; otherwise
			# envelope lines following it would be appended to the text.
			my $closed = ($line =~ m!</text>!) || ($line =~ m!<text\b[^>]*/>!);
			$intext = $closed ? 0 : 1;
			# NOTE(review): the opening <text ...> tag stays part of the first
			# text line, so a changing numeric attribute on it could look like
			# a numbers-only edit - confirm against the dump format in use.
		}
	}
}