User:Interiot/Tool/code
Appearance
< User:Interiot | Tool
#!/usr/bin/perl
# License: [[Public domain]]
# regression test cases:
# Markadet fr.wikipedia.org (11k edits)
# Kolja21 de.wikipedia.org (5.1k edits)
# OldakQuill en.wikipedia.org (12k edits)
# Mxn meta.wikimedia.org (1.7k edits)
# Helios89 it.wikipedia.org (7k edits)
# TODO:
# - regarding the "403 access denied" problem, contact noc@wikimedia.org or #wikimedia-tech on freenode
# - ahh, they actively block screen-scrapers
# - sweet-talk Hashar or Dom into unblocking, temporarily disable the tool or enable some form of rate limiting, etc.
# - add a starting-cutoff-date, so renominations for RfA could only include the most recent items
# - add a # edits per day
# - use something like this to retrieve the list of namespaces in real-time:
# http://en.wikiquote.org/wiki/Special:Export/Main_Page
# - make "minor" actually work well for editcountitis:
# - eg. for each namespace, present it like: Category Talk: 23 (13)
# where "23" is the non-minor edits, and "13" is the minor edits
# - get it to work with other mediawikis (example: http://brandt-watch.org/bwwiki/Main_Page)
# - include a date at the end of the top-15 breakdown
# - change the <div>s to <td>s on graph bars
# - don't count comments as having an edit summary when it's purely an autocomment
# - fix the issue where there's an "extra" first result when $offset > 0
#
# - REWRITE IN AJAX so we don't have to worry about it being a temporary solution or not
# - fix the sorting order on the output
#
# - ?? http://tools.wikimedia.de/~avar/cgi-bin/count
# Possible other analysis graphs:
# - monthly breakdowns
# : have all the monthly breakdowns appear in one space on the page, but allow the user to
# select between them with Javascript
# - monthly breakdown of major/minor edits (like current red/green... make major edits on left, with minor edits trailing on right)
# - monthly breakdown of the number of edits with summaries of /^(rv|revert)/
# - monthly breakdown, one each for the separate namespaces
# - on monthly breakdowns, extrapolate the current month forward
# - allow the user to hit ''(more)'' at the bottom of the namespace breakdowns, allowing them to
# see a more complete list of top-15
# - allow the user to restrict the metrics to some specific recent period... eg. this is
# something that's sometimes discussed on RfA
# - any content-based analyses? (I suppose one would have to know which SQL thingies are quicker than others)
# semi-far-out:
# - allow the user to see JUST their edits from a specific page, when they click on that page on
# the top-15 breakdown (furthermore, if structured right, it might let anybody's tool basically to
# pop up the results of a $user && $page query)
# - allow the results to be the combination of multiple users (either logged-in-user + anon-IP,
# and multiple logged-in-users from multiple sites, eg. meta)
use strict;
use warnings;
use CGI;
#use CGI::Carp qw(fatalsToBrowser);
use Date::Parse;
use LWP::Simple;
use HTML::Entities;
use Data::Dumper;
# Path of the append-only usage log (timestamp, user, site, bandwidth, edits).
sub LOGFILE { return "/home/interiot/public_html/tmp/wannabe_kate.log" }
# Serve this script's own source when invoked as "...?code" (the code is
# public domain, per the header).
# Fixes: use 3-arg open with a lexical filehandle instead of the unsafe
# 2-arg bareword form (and close it), and guard against an undefined
# QUERY_STRING, which emitted a warning under "use warnings".
if (defined $ENV{QUERY_STRING} && $ENV{QUERY_STRING} eq "code") { # send ourself when requested
    if (open my $fin, '<', $0) {
        print "Content-type: text/plain\n\n", <$fin>;
        close $fin;
    }
    exit;
}
# fill out using these documents:
# http://meta.wikimedia.org/wiki/MediaWiki_localisation#Getting_latest_file
# http://sourceforge.net/docs/E04/
# Parse a namespace list: one namespace per line, "#" starts a trailing
# comment, blank lines are skipped, surrounding whitespace is trimmed.
# Returns a hashref that doubles as a set (each namespace => 1) and carries
# the original ordering under the non-colliding key "\x00order".
sub nmspc {
    my ($text) = @_;
    my @names;
    for my $line (split /[\n\r]+/, $text) {
        next unless $line =~ /\S/;     # skip blank lines (before stripping)
        $line =~ s/#.*//;              # drop trailing comment
        $line =~ s/^\s+|\s+$//g;       # trim whitespace
        push @names, $line;
    }
    return { "\x00order" => [@names], map { ($_, 1) } @names };
}
# Per-site namespace tables: lowercased hostname => nmspc() hashref.
# Each qq[...] body is runtime data parsed by nmspc() (one namespace prefix
# per line; "#"-comments inside it are stripped by nmspc, not by Perl).
# Only wikis listed here get the per-namespace top-15 breakdown.
my %valid_namespaces = (
# English Wikipedia: canonical English namespace names.
'en.wikipedia.org' => nmspc(qq[
Talk:
Category talk:
Category:
Help:
Help talk:
Image:
Image talk:
MediaWiki:
MediaWiki talk:
Portal:
Portal talk:
Template:
Template talk:
User:
User talk:
Wikipedia:
Wikipedia talk:
]),
# German Wikipedia: localized names; each "# ..." maps to its English name.
'de.wikipedia.org' => nmspc(qq[
Diskussion: # Talk
Kategorie: # Category:
Kategorie Diskussion: # Category Talk:
Hilfe: # Help:
Hilfe Diskussion: # Help Talk:
Bild: # Image:
Bild Diskussion: # Image Talk:
MediaWiki: # MediaWiki:
MediaWiki Diskussion: # MediaWiki Talk:
Portal: # Portal:
Portal Diskussion: # Portal Talk:
Vorlage: # Template:
Vorlage Diskussion: # Template Talk:
Benutzer: # User:
Benutzer Diskussion: # User Talk:
Wikipedia: # Wikipedia:
Wikipedia Diskussion: # Wikipedia Talk:
]),
# Italian Wikipedia.  NOTE(review): these entries lack the trailing colon
# that the namespace-matching regex in get_5000 captures ("Name:"), so the
# exists-lookup there may never match them -- confirm and add colons if so.
'it.wikipedia.org' => nmspc(qq[
Discussione # Talk:
Categoria # Category:
Discussioni categoria # Category Talk:
Aiuto # Help:
Discussioni aiuto # Help Talk:
Immagine # Image:
Discussioni immagine # Image Talk:
MediaWiki # MediaWiki:
Discussioni MediaWiki # MediaWiki Talk:
Template # Template:
Discussioni template # Template Talk:
Utente # User:
Discussioni utente # User Talk:
Wikipedia # Wikipedia:
Discussioni Wikipedia # Wikipedia Talk:
]),
);
# ---- Request parsing and validation ----
my $query = new CGI;
my $site = $query->param("site");
# URL-escape the username so it can be embedded directly in the
# Special:Contributions URL later; spaces/plusses become underscores
# (MediaWiki's canonical title form).
my $username = CGI::Util::escape($query->param("username"));
$username =~ s/[\+\s]/_/g;
my $isvalid = 0;
my $this_namespace;
if ($ENV{QUERY_STRING}) {
$isvalid = 1;
# Only accept hostnames of word chars/dots ending in a known TLD, so we
# never fetch from arbitrary URLs.
$isvalid = 0 unless ($site =~ /^[\w\.]*\.(org|com|net)$/i);
#$isvalid = 0 unless ($username =~ /^[-\w\._]*$/);
$isvalid = 0 if (length($username) == 0);
}
# ---- Accumulators filled while parsing the Special:Contributions pages ----
# data we generate by parsing the output from Wikipedia
my @urls;                      # every contributions URL fetched (also the 10-page cap)
my $bandwidth_down = 0;        # total bytes downloaded
my %namespace_totals;          # namespace -> edit count, plus summary keys ("total", ...)
my $xml_lang = "";             # the remote page's opening <html ...> tag
my $earliest_perldate;         # epoch time of the oldest edit seen
my $latest_perldate;           # epoch time of the newest edit seen
my %month_totals;              # "YYYY/M" -> edits that month
my %month_editsummary_totals;  # "YYYY/M" -> edits that month with an edit summary
my %unique_articles;           # article URL-name -> edit count
my %namespace_unique_articles; # namespace -> { article URL-name -> edit count }
my %article_titles;            # article URL-name -> human-readable title
print "Content-type: text/html; charset=utf-8\n\n";
#cgi_dumper(\%valid_namespaces);
if (!$isvalid) {
# ---- No/invalid query: show the input form and usage notes ----
if ($ENV{QUERY_STRING}) {
print "<font color=red><b>Invalid value</b></font>. <a href='http://en.wikipedia.org/wiki/Special:Emailuser/Interiot'>email Interiot</a> if this is incorrect.<p><br><br>\n";
}
print <<"EOF";
This is a slow substitute for <a
href="http://en.wikipedia.org/wiki/Wikipedia:Kate%27s_Tool">Kate's Tool</a> when it's unavailable.
<form method=GET style="padding-top:1em">
<table><tr><td>username <td><input maxlength=128 name=username value="" title="username">
<tr><td>site <td><input maxlength=128 name=site value="en.wikipedia.org" title="site">
<tr><td> <td><input type=submit value="Submit">
</table>
</form>
Notes:
<ul>
<li>Green bars are for edit summaries, red bars are for edits with no summaries
<li>The statistics are real-time (it <a href="http://en.wikipedia.org/wiki/Screen_scraping">scrapes</a> data off of the <tt>Special:Contributions</tt> page while you wait).
<li>It's somewhat slow for edit counts over 5000
<li>It's unable to count deleted edits
<li>It should work with most wikis out there that use <a href="http://en.wikipedia.org/wiki/MediaWiki">MediaWiki</a>, since it doesn't need privileged access to the databases.
<!-- <li>This can't be more than a temporary solution for Wikipedia, as it wastes ~1GB/day of extra bandwidth compared to Kate's -->
<li>Source code is in the <a href="http://en.wikipedia.org/wiki/Public_domain">public domain</a> and available <a href="$ENV{SCRIPT_NAME}?code">here</a>
<li>Warning: <a href="http://www.bbc.co.uk/dna/h2g2/A1091350">metrics are evil</a>
</ul>
For bug reports/comments, see <a href="http://en.wikipedia.org/wiki/User_talk:Interiot">User talk:Interiot</a> or <a href="http://en.wikipedia.org/wiki/Special:Emailuser/Interiot">email him</a>.
EOF
} else {
# ---- Valid query: scrape the contributions, then render the report ----
# Namespace table for this wiki, if we have one (undef for unknown wikis).
$this_namespace = $valid_namespaces{lc $site};
#cgi_dumper(\$this_namespace); exit;
# Trim leading/trailing underscores from the canonicalized username.
# NOTE(review): the trailing alternative is /_$/ (one underscore), not
# /_+$/ -- multiple trailing underscores would not all be removed; confirm
# whether that was intended.
$username =~ s/^_+|_$//g;
#print "$site<br>$username\n";
# get_5000() fetches/parses up to 10 pages of 5000 contributions each,
# filling the file-level accumulators, and returns the earliest edit's
# date string.
$namespace_totals{earliest} = get_5000($site, $username, 0);
#cgi_dumper(\@urls, \%namespace_totals); exit;
#cgi_dumper(\%unique_articles);
$namespace_totals{"number of unique articles"} = scalar(keys %unique_articles);
# NOTE(review): divides by the unique-article count; dies on a user with
# zero parsed edits -- confirm upstream validation makes that impossible.
$namespace_totals{"avg edits per article"} = sprintf("%5.2f", $namespace_totals{total} / $namespace_totals{"number of unique articles"});
# Echo the remote wiki's own <html ...> tag so the page inherits its
# language/direction attributes, then emit our stylesheet.
print $xml_lang, <<'EOF';
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style>
td {padding: .1em 1em .1em}
table.months {padding-top: 2em}
table.months td.date {font-weight: bold}
table.months td {font-size: 75%}
div.red, div.green {
height:1em;
float:left;
}
div.red {background-color: #f00}
div.green {background-color: #0f0}
div.topN {
float: left;
min-height: 30em; /* otherwise, they get ALL jumbled up */
}
table.topN {
float: left;
border: 1px solid black;
}
table.topN th {
background-color: #000;
color: #fff;
}
table.topN td {
/* override the above */
padding: .1em .3em;
}
</style>
</head>
<body>
EOF
print "<ul style='padding-left:10em'><a href='$ENV{SCRIPT_NAME}'><i>Go back</i></a> to see caveats or to check another user.</ul>\n";
print "<h3>User:$username</h3>\n";
# Overall per-namespace totals table (sorted by key name).
print "<table>\n";
foreach my $key (sort keys %namespace_totals) {
print "<tr><td>", $key, "<td>", $namespace_totals{$key}, "\n";
}
print "</table>\n";
#### output the months stats
#cgi_dumper(\%month_editsummary_totals);
my @months = list_months();
# Widest month (max edits) scales all the bars to a 500px maximum.
my $max_width = 0;
$max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals);
if ($max_width > 0) {
print "<table class=months>\n";
foreach my $month (@months) {
# NOTE(review): months inside the range with no edits have undef totals
# here, producing "uninitialized value" warnings -- confirm acceptable.
my $no_summary = $month_totals{$month} - $month_editsummary_totals{$month};
print "<tr><td class=date>$month <td>", $month_totals{$month}, "\n";
#print "<td><div class=red style='width:", int(500 * $month_totals{$month} / $max_width), "px'></div>\n";
# Green bar: edits with summaries; red bar trails for edits without.
print "<td><div class=green style='width:", int(500 * $month_editsummary_totals{$month} / $max_width), "px'></div>\n";
print "<div class=red style='width:", int(500 * $no_summary / $max_width), "px'></div>\n";
}
print "</table>\n";
}
#### output the top-15 namespace stats
my $num_to_present = 15;
if ($this_namespace) { # only do it if we're sure about the namespaces
print "<p><br>\n";
#print "<ul>NOTE: This section has a tendency to hilight a user's \"youthful indiscretions\". Please take the dates of the edits into account.</ul>\n";
# One table per namespace, in the order declared in %valid_namespaces,
# with Mainspace first.
foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) {
next unless %{$namespace_unique_articles{$nmspc}};
# Most-edited articles first; single-edit articles are uninteresting.
my @articles = sort {$namespace_unique_articles{$nmspc}{$b} <=> $namespace_unique_articles{$nmspc}{$a}}
grep { $namespace_unique_articles{$nmspc}{$_} > 1} # filter out items with only 1 edit
keys(%{$namespace_unique_articles{$nmspc}});
next unless @articles;
#print "<div class=topN>\n";
print "<table class=topN><tr><th colspan=2>$nmspc\n";
my @present = splice(@articles, 0, $num_to_present);
foreach my $article (@present) {
my $artname = $article_titles{$article};
# Outside Mainspace, drop the "Namespace:" prefix from the display name.
if ($nmspc ne 'Mainspace') {
$artname =~ s/^.*?://;
}
$artname =~ s/\s/ /g;
my $url = "http://$site/w/index.php?title=$article&action=history";
print "<tr><td>", $namespace_unique_articles{$nmspc}{$article}, "<td><a href='$url'>$artname</a>\n";
}
# fill it out so float:left doesn't jumble up
# (@present in scalar context .. 14 pads every table to 15 rows)
foreach (@present..14) {
print "<tr><td> <td> \n";
}
print "</table>\n";
#print "</div>\n";
}
}
#### output the bottom summary
print "<p style='clear:left'><br><br>If there were any problems, please <a href='http://en.wikipedia.org/wiki/Special:Emailuser/Interiot'>email Interiot</a> or post at <a href='http://en.wikipedia.org/wiki/User_talk:Interiot'>User talk:Interiot</a>.\n";
#print "<p>Based on these URLs:\n<ul>\n", join("\n", map {"<li><a href='$_>$_</a>"} @urls), "</ul>\n";
# Footnote links [1], [2], ... to every contributions page we fetched.
print "<div style='padding:1em 3em; font-size: 60%'>Based directly on these URLs:\n";
foreach my $ctr (0..$#urls) {
print "<a href='$urls[$ctr]'>[", ($ctr+1), "]</a>";
print ", " unless ($ctr >= @urls - 1);
print "\n";
}
print "</div>\n";
#### log the bandwidth used
open FOUT, ">>" . LOGFILE() or die;
printf FOUT "%s %-20s %-30s %5dK %7d\n", scalar(localtime), $username, $site,
int($bandwidth_down / 1024), $namespace_totals{total};
close FOUT;
}
# Fetch and parse one Special:Contributions page (up to 5000 edits) for
# $username at $site, starting at timestamp offset $offset (0 = newest).
# Side effects: fills the file-level accumulators (%namespace_totals,
# %month_totals, %unique_articles, @urls, $bandwidth_down, ...).  Recurses
# to older pages via the pager link until none remain.  Returns the date
# string of the earliest edit seen on the last page parsed.
sub get_5000 {
my $site = shift;
my $username = shift;
my $offset = shift;
my $earliest = "";
my $url = "http://$site/w/index.php?title=Special:Contributions&target=$username&offset=${offset}&limit=5000";
# Lazily initialize LWP::Simple's shared user agent and give it a
# wget-like User-Agent, since the servers reject unknown screen-scrapers.
if (! $LWP::Simple::ua) {
LWP::Simple::_init_ua();
#$LWP::Simple::ua->agent("Mozilla/4.0 WebTV/2.6 (compatible; MSIE 4.0)"); # apparently they're picky about useragent strings
$LWP::Simple::ua->agent("Wget/1.9.1"); # apparently they're picky about useragent strings. Use the same as wget.
}
push(@urls, $url);
# Cap at 10 pages (~50k edits) per request.
# NOTE(review): this only prints the message -- the fetch below still
# proceeds because the exit is commented out; confirm whether a real
# "exit" (or return) was intended here.
if (@urls >= 10) {
print "Too many pages fetched. Terminating.<br>\n";
#cgi_dumper(\@urls); exit;
}
my $page;
if (1) { # live fetch; the dead "else" branch reads a cached copy for debugging
my $request = HTTP::Request->new(GET => $url);
my $response = $LWP::Simple::ua->request($request);
if (!$response->is_success) {
print "While trying to fetch <a href='$url'>$url</a>, $site responded:<br><br>\n", $response->status_line, "<br><br>", $response->content;
exit;
}
$page = $response->content;
$bandwidth_down += length($page);
if (0) { # debugging: save the fetched page for the "else" branch above
local *FOUTOUT;
open FOUTOUT, ">/var/tmp/kate/tmp.out" or die;
print FOUTOUT $page;
close FOUTOUT;
}
} else {
open FININ, "</var/tmp/kate/tmp.out" or die;
local $/ = undef;
$page = <FININ>;
close FININ;
}
# Remember the remote <html ...> tag so the report can reuse its
# language/direction attributes.
if ($page =~ /(<html [^>]+>)/i) {
$xml_lang = $1;
}
## parse each individual contribution
#while ($page =~ /^<li>(\d\d:\d\d,.*)/igm) {
# Each contribution is an <li> whose first parenthesis wraps the
# page-history link.
while ($page =~ /^<li>([^(]+\(<a href="[^"]+action=history.*)/igm) {
my $this_time;
local $_ = $1;
my $edit_summary;
#$edit_summary++ if (m#<a href="/wiki/[^"]*"\s+title="[^"]*">[^<]*</a>\s*\(#is);
# An edit summary is present iff the line carries a <span class='comment'>.
$edit_summary++ if (/<span class='comment'>/si);
my $article_url;
# The first /wiki/... link yields the article's URL-encoded name ($1)
# and its human-readable title ($2).
if (m#<a href="/wiki/([^"]+)" title="[^"]+">([^<]+)#si) {
$article_url = $1;
$article_titles{$1} = $2;
}
$unique_articles{$article_url}++;
## strip out all the HTML tags
s/<[^>]*>//gs;
# The text up to the first " (" is the edit's timestamp.
if (/^(.*?) \(/) {
my $date = $1;
$earliest = $date;
# translate months into english, so Date::Parse can handle them
# languages believed to work here: EN, DE, IT
$date =~ s/\b(?:gen )\b/jan/gix;
$date =~ s/\b(?:mär )\b/mar/gix;
$date =~ s/\b(?:mai|mag )\b/may/gix;
$date =~ s/\b(?:giu )\b/jun/gix;
$date =~ s/\b(?:lug )\b/jul/gix;
$date =~ s/\b(?:ago )\b/aug/gix;
$date =~ s/\b(?:set )\b/sep/gix;
$date =~ s/\b(?:okt|ott )\b/oct/gix;
$date =~ s/\b(?:dez|dic )\b/dec/gix;
$this_time = str2time($date);
if ($this_time == 0) {
#print "XXXXXXXXXXXXXXXXXXXXXXXXX<br>\n";
} else {
#print scalar(gmtime($this_time)), "<br>\n";
# Contributions list newest-first, so each parsed edit pushes
# $earliest_perldate back, while the very first one seen fixes
# $latest_perldate (||= only assigns once).
$earliest_perldate = $this_time; # record the earliest and latest month we see
$latest_perldate ||= $this_time;
my $monthkey = monthkey(localtime($this_time));
$month_totals{$monthkey}++;
$edit_summary && $month_editsummary_totals{$monthkey}++;
}
}
# Drop the leading "time (hist) (diff)" prefix so $_ starts at the title.
s/^[^()]*\([^()]*\) \([^()]*\) (?:\S )? //;
my $subspace = "Mainspace";
# A leading one- or two-word "Name:"/"Name Name:" prefix is treated as a
# namespace if it's in this site's table (or if we have no table at all).
if (/^([^\s\d\/:]+(?:\s[^\s\d\/:]+)?:)/) {
if (!$this_namespace || exists $this_namespace->{$1}) {
$subspace = $1;
}
}
$namespace_totals{$subspace}++;
$namespace_totals{total}++;
$namespace_unique_articles{$subspace}{$article_url}++;
#print "$_<br>\n";
}
## if they have more than 5000 contributions, go to the next page
# Follow the pager link whose offset is older (smaller) than the current
# one; tail-recurse so the accumulators keep growing.
while ($page =~ /href="[^"]+:Contributions[^"]+offset=(\d+)/ig) {
#print "Trying again at offset $1<br>\n";
next unless $1 > 0 && ($offset == 0 || $1 < $offset);
return get_5000($site, $username, $1); # tail recursion until there are no more
}
return $earliest;
}
# Returns the list of month keys spanning the user's edit history, e.g.
# ("2003/10", "2003/11", "2003/12"), derived from the file-level
# $earliest_perldate / $latest_perldate epoch bounds.
sub list_months {
    my @months;
    my $prev_key = '';
    # Walk from the earliest edit to the latest in 10-day steps; since every
    # month is at least 28 days long, no month can be stepped over entirely.
    my $t = $earliest_perldate;
    while ($t <= $latest_perldate) {
        my $key = monthkey(localtime($t));
        if ($key ne $prev_key) {
            push @months, $key;
            $prev_key = $key;
        }
        $t += 10 * 24 * 60 * 60;
    }
    return @months;
}
# Build a "YYYY/M" key from a localtime()-style list: index 5 is the year
# offset from 1900, index 4 is the 0-based month.
sub monthkey {
    my $year  = $_[5] + 1900;
    my $month = $_[4] + 1;
    return "$year/$month";
}
# Debug helper: print the given structures as HTML-escaped <pre> output.
sub cgi_dumper {
    my $dump = Dumper(@_);
    print '<pre>', HTML::Entities::encode($dump), '</pre>';
}