User:Johantheghost/wp-refconvert.pl
Appearance
#!/usr/bin/perl
# A perl script to convert Wikipedia {{ref}}-style references to use the
# <ref> feature.
#
# Usage:
#     wp-refconvert article.txt
# creates a new file called article-new.txt, containing the new version
# of the article.
#
# The article is processed in two passes: readArticle() collects every
# {{ref...}} template and the text attached to every {{note...}} template
# into @references; editArticle() then re-reads the article and writes the
# converted copy.

use strict;
use warnings;
use utf8;

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

my $prog = "wp-refconvert";


##############################################################################
# Global Data
##############################################################################

# Number of simple {{ref|...}} templates seen while reading, number of notes
# seen while reading, and number of simple {{ref|...}} templates rewritten
# while editing.  Each counter supplies the implicit sequence number of the
# templates that carry no explicit number.
my $numRefs = 0;
my $numNotes = 0;
my $fixedRefs = 0;

# Table of references.  Slot (number - 1) holds a hash with the keys
#   name     - reference name taken from the template
#   count    - how many times the reference is cited in the article
#   usecount - how many citations fixRef() has already emitted
#   text     - note text, set by addNote() once the matching note is found
my @references;


##############################################################################
# Template Parsing Helpers
##############################################################################

# Validate an explicit reference/note number taken from a template.
#
#   $text     - the raw number field from the template
#   $template - the whole template, used only in the error message
#
# Returns the number as a plain integer.  Dies if the field is not a
# positive integer: the number is used as a 1-based index into @references,
# so 0 or non-numeric text would silently address the wrong slot.
sub cleanNumber {
    my ( $text, $template ) = @_;

    my $num = $text;
    $num =~ s/^\s+//;
    $num =~ s/\s+$//;
    if ($num !~ /^\d+$/ || $num < 1) {
        die("$prog: bad number \"$text\" in \"$template\"\n");
    }
    return $num + 0;
}

# Split a reference template into its name and explicit number.
#
#   $ref - one complete template: {{ref|name}}, {{ref_num|name|number}} or
#          {{ref_label|name|number|label}}
#
# Returns ( name, number ); number is undef for the simple {{ref|name}} form,
# which has no explicit number.  The label of {{ref_label}} is not needed for
# the conversion and is dropped.  Dies on any other template shape.
sub parseRef {
    my ( $ref ) = @_;
    my ( $name, $num );

    if (($name) = ( $ref =~ /^\{\{ref\|([^}|]+)\}\}$/)) {
        return ( $name, undef );
    }
    if (($name, $num) = ( $ref =~ /^\{\{ref_num\|([^}|]+)\|([^}|]+)\}\}$/)) {
        return ( $name, cleanNumber($num, $ref) );
    }
    if (($name, $num) =
            ( $ref =~ /^\{\{ref_label\|([^}|]+)\|([^}|]+)\|([^}|]+)\}\}$/)) {
        return ( $name, cleanNumber($num, $ref) );
    }

    die("$prog: unknown reference style \"$ref\"\n");
}


##############################################################################
# Article Parsing
##############################################################################

# First pass: scan the article and fill in @references.
#
#   $file - path of the article to read
#
# Dies if the file cannot be opened, or if addRef()/addNote() reject a
# template.
sub readArticle {
    my ( $file ) = @_;

    open(my $in, "<:utf8", $file)
        || die("$prog: can't open $file: $!\n");

    while (my $line = <$in>) {
        # Check for references in the line; and check for notes.  These
        # should be mutually exclusive.
        my @refs = ( $line =~ m/\{\{ref[^}]+\}\}/g );
        foreach my $r (@refs) {
            addRef($r);
        }

        # /c keeps the match position after the last note template, so the
        # \G match below picks up the text that follows it on the line.
        my @notes = ( $line =~ m/ *(\{\{note[^}]+\}\})/gc );
        if (scalar(@notes) > 0) {
            my ( $text ) = ( $line =~ m/\G *(.*)$/ );
            addNote($notes[0], $text);
        }
    }

    close($in);
}

# Record one citation of a reference in @references.
#
#   $ref - one complete {{ref...}} template
#
# A simple {{ref|name}} takes the next sequence number; the numbered forms
# use their explicit number.  Dies if the template is malformed or if the
# same number was already recorded under a different name.
sub addRef {
    my ( $ref ) = @_;

    my ( $name, $num ) = parseRef($ref);
    if (!defined($num)) {
        $num = ++$numRefs;
    }

    my $record = $references[$num - 1];
    if (!defined($record)) {
        $references[$num - 1] =
            { 'name' => $name, 'count' => 1, 'usecount' => 0 };
        return;
    }

    if ($record->{'name'} ne $name) {
        die(sprintf "$prog: note mismatch: ref %d=%s; note=%s\n",
                    $num, $record->{'name'}, $name);
    }
    ++$record->{'count'};
}

# Attach the text of a note to the reference it belongs to.
#
#   $note - one complete template: {{note|name}} or
#           {{note_label|name|number|label}}
#   $text - the rest of the line after the template
#
# Notes are expected in sequence: a {{note_label}} must carry the number the
# running note counter has reached.  Dies if the template is malformed, the
# number is out of sequence, no reference with that number was recorded, or
# the names of the reference and the note differ.
sub addNote {
    my ( $note, $text ) = @_;
    my ( $name, $num );

    if (($name) = ( $note =~ /^\{\{note\|([^}|]+)\}\}$/)) {
        $num = ++$numNotes;
    }
    elsif (($name, $num) =
            ( $note =~ /^\{\{note_label\|([^}|]+)\|([^}|]+)\|([^}|]+)\}\}$/)) {
        $num = cleanNumber($num, $note);
        if ($num != ++$numNotes) {
            die("$prog: note number mismatch: seq = $numNotes, explicit = $num\n");
        }
    }
    else {
        die("$prog: unknown note style \"$note\"\n");
    }

    my $record = $references[$num - 1];
    if (!defined($record)) {
        die("$prog: note mismatch: no ref $num ($name)\n");
    }
    if ($record->{'name'} ne $name) {
        die(sprintf "$prog: note mismatch: ref %d=%s; note=%s\n",
                    $num, $record->{'name'}, $name);
    }

    $record->{'text'} = $text;
}


##############################################################################
# Article Editing
##############################################################################

# Second pass: copy the article to a new file, converting references.
#
#   $file   - path of the article to read
#   $output - path of the converted article to create
#
# Every {{ref...}} template is replaced by the result of fixRef().  Lines
# starting with "# {{note" are dropped, and a single <references/> tag is
# written in place of the first one.  Dies if either file cannot be opened,
# if the output cannot be written completely, or if fixRef() fails.
#
# NOTE(review): readArticle() accepts a note template anywhere on a line,
# but only lines starting with "# {{note" are removed here -- confirm that
# all note lines in the target articles have that form.
sub editArticle {
    my ( $file, $output ) = @_;
    my $doneRefs = 0;

    open(my $in, "<:utf8", $file)
        || die("$prog: can't open $file: $!\n");
    open(my $out, ">:utf8", $output)
        || die("$prog: can't create $output: $!\n");

    while (my $line = <$in>) {
        # Check for references in the line; and check for notes.  These
        # should be mutually exclusive.
        if ($line =~ /^\# \{\{note/) {
            if (!$doneRefs) {
                print $out "<references/>\n";
                ++$doneRefs;
            }
        }
        else {
            $line =~ s/(\{\{ref[^}]+\}\})/fixRef($1)/ge;
            print $out $line;
        }
    }

    close($in);

    # Buffered write errors (e.g. a full disk) only surface at close.
    close($out)
        || die("$prog: error writing $output: $!\n");
}

# Build the <ref> markup that replaces one reference template.
#
#   $ref - one complete {{ref...}} template
#
# Returns
#   <ref>text</ref>               for a reference cited once,
#   <ref name="name">text</ref>   for the first citation of a reference
#                                 cited several times,
#   <ref name="name"/>            for its later citations.
#
# Dies if the template is malformed or no such reference was recorded by
# readArticle().  A reference that never got a note is converted with empty
# text, and a warning is printed so the gap does not go unnoticed.
sub fixRef {
    my ( $ref ) = @_;

    my ( $name, $num ) = parseRef($ref);
    if (!defined($num)) {
        $num = ++$fixedRefs;
    }

    my $record = $references[$num - 1];
    if (!defined($record)) {
        die("$prog: ref mismatch: no ref $num ($name)\n");
    }

    my $shared = ($record->{'count'} > 1);

    # Later citations of a shared reference only point back at the first.
    if ($shared && $record->{'usecount'}++ > 0) {
        return sprintf "<ref name=\"%s\"/>", $record->{'name'};
    }

    my $text = $record->{'text'};
    if (!defined($text)) {
        warn("$prog: warning: no note text for ref $num ($record->{'name'})\n");
        $text = "";
    }

    if ($shared) {
        return sprintf "<ref name=\"%s\">%s</ref>", $record->{'name'}, $text;
    }
    return sprintf "<ref>%s</ref>", $text;
}


##############################################################################
# Diagnostics
##############################################################################

# Print the reference table to STDERR, one line per recorded reference:
# number, name, citation count and note text.  A summary line comes first
# when the counts of simple references and of notes differ.  Every filled
# slot of @references is listed, so explicitly numbered references are
# included as well.
sub dumpRefs {
    if ($numNotes != $numRefs) {
        printf STDERR "## %d refs; %d notes\n", $numRefs, $numNotes;
    }

    foreach my $i (1 .. scalar(@references)) {
        my $record = $references[$i - 1];
        next if (!defined($record));

        my $text = defined($record->{'text'}) ? $record->{'text'} : "";
        printf STDERR "[%2d] %-12s (%2d) %s\n",
            $i, $record->{'name'}, $record->{'count'}, $text;
    }
}


##############################################################################
# Main
##############################################################################

# Convert the article named by the first argument.
#
#   @args - command-line arguments; $args[0] is the article file
#
# The output name is the input name with "-new" inserted before the
# extension (article.txt -> article-new.txt), or with ".new" appended when
# the file name has no extension.  The extension is taken from the first
# "." in the file name, so a.b.txt becomes a-new.b.txt.
#
# Returns 0, the exit status of the script.  Dies with a usage message if no
# article is given.
sub main {
    my ( @args ) = @_;

    if (!defined($args[0]) || $args[0] eq "") {
        die("Usage: $prog article.txt\n");
    }
    my $article = $args[0];

    # First, parse the article.
    readArticle($article);
    # dumpRefs();

    # Then write the converted copy next to it.
    my $newvers = $article;
    ($newvers =~ s{\.([^/]+)$}{-new.$1}) || ($newvers .= ".new");
    printf STDERR "## edit %s -> %s\n", $article, $newvers;
    editArticle($article, $newvers);

    return 0;
}

exit(main(@ARGV));