User:Carnildo/wiki-regex-tester.pl
Appearance
Common usages:
wiki-regex-tester.pl titles.txt < blacklist.txt
Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.
wiki-regex-tester.pl 'Title of a Wikipedia article' < blacklist.txt
Will test to see if 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt"
wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' |perl wiki-regex-tester.pl ns_0.txt|wc -l
Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of titles matched.
#!/usr/bin/perl use warnings; use strict; use utf8; use Time::HiRes; binmode STDIN, ":utf8"; binmode STDOUT, ":utf8"; binmode STDERR, ":utf8"; my @regexes; while(<STDIN>) { my $regex = $_; my $ignorecase = 1; my $moveonly = 0; my $newaccount = 0; $regex =~ s/#.*$//g; # Strip comments $ignorecase = 0 if($regex =~ /casesensitive/); $moveonly = 1 if($regex =~ /moveonly/); $newaccount = 1 if($regex =~ /newaccountonly/); $regex =~ s/<(moveonly|newaccountonly|casesensitive|\||errmsg=[^|>]*| )+>//g; # Strip modifiers $regex =~ s/\s*$//g; # Strip trailing space $regex =~ s/^\s*//g; # Strip leading space if($regex !~ /^\s*$/ and !$newaccount) { push @regexes, [$regex, $ignorecase, $moveonly]; } } print STDERR "Testing " . scalar(@regexes) . " regexes\n"; my $lines = 0; my $lines2 = 0; my $regex_count = 0; foreach my $regex_entry (@regexes) { my $start_time = Time::HiRes::time(); my $u_start_time = Time::HiRes::clock(); my $maxtime = 0; my ($regex, $ignorecase, $moveonly) = @{$regex_entry}; if(-e $ARGV[0]) { open INFILE, "<", $ARGV[0]; binmode INFILE, ":utf8"; } else { open INFILE, "<", \$ARGV[0]; binmode INFILE, ":utf8"; } while(<INFILE>) { my $target = $_; chomp $target; $target =~ s/_/ /g; if($ignorecase) { if($target =~ /^$regex$/i) { print "* [[$target]] :: $regex\n"; } } else { if($target =~ /^$regex$/) { print "* [[$target]] :: $regex\n"; } } $lines = $lines + 1; if($lines >= 10000) { my $newtime = Time::HiRes::clock(); my $diff = $newtime - $u_start_time; $u_start_time = $newtime; $maxtime = $diff if($diff > $maxtime); $lines = 0; $lines2 += 10000; print STDERR "$diff $lines2\r"; } } $regex_count += 1; my $stop_time = Time::HiRes::time(); print STDERR "Regex $regex took " . ($stop_time - $start_time) . " seconds\n"; print STDERR "Slowest batch took $maxtime seconds\n"; close INFILE; }