Wikipedia:Duplicated sections/script
# Hot pipes
$| = 1;

# This script is expecting entries.txt to be a relatively recent
# database dump that has been pre-processed to put each page on a
# line by itself.

# On 31 July 2005, this script ran on a 1.2GHz i686 laptop with ~700MB
# RAM in about 20 minutes.  Not using the dupHeaders() filter will
# probably cause it to take about 5 hours or more.

# The author of this script is Christopher Beland, User:Beland on
# en.wikipedia.org.  It is hereby released into the Public Domain.
# Feel free to use it for any purpose whatsoever.

use strict;

main();

sub main
{
    my ($cur_id, $cur_namespace, $cur_title, $cur_text, @junk, $line,
        $cur_namespace_name, $i, $j, @tokens, $printed, $chain);

    unless (-d "./todo")
    {
        mkdir "./todo";
    }

    open (ENTRIES, "<data/entries.txt")
        || die "Cannot read data/entries.txt";
    open (DUPHEAD, ">todo/duplicate-chunks.txt")
        || die "Cannot write todo/duplicate-chunks.txt";

    while (<ENTRIES>)
    {
        if (++$j % 100 == 0)
        {
            print STDERR $j."\r";
        }

        $line = $_;
        eval("\@tokens = $line");
        ($cur_id, $cur_namespace, $cur_title, $cur_text, @junk) = @tokens;

        unless (dupHeaders($cur_text) == 1)
        {
            next;
        }

        if    ($cur_namespace == -2) { $cur_namespace_name = "Media:"; }
        elsif ($cur_namespace == -1) { $cur_namespace_name = "Special:"; }
        elsif ($cur_namespace == 0)  { $cur_namespace_name = ""; }
        elsif ($cur_namespace == 1)  { $cur_namespace_name = "Talk:"; }
        elsif ($cur_namespace == 2)  { $cur_namespace_name = "User:"; }
        elsif ($cur_namespace == 3)  { $cur_namespace_name = "User_talk:"; }
        elsif ($cur_namespace == 4)  { $cur_namespace_name = "Wikipedia:"; }
        elsif ($cur_namespace == 5)  { $cur_namespace_name = "Wikipedia_talk:"; }
        elsif ($cur_namespace == 6)  { $cur_namespace_name = ":Image:"; }
        elsif ($cur_namespace == 7)  { $cur_namespace_name = "Image_talk:"; }
        elsif ($cur_namespace == 8)  { $cur_namespace_name = "MediaWiki:"; }
        elsif ($cur_namespace == 9)  { $cur_namespace_name = "MediaWiki_talk:"; }
        elsif ($cur_namespace == 10) { $cur_namespace_name = "Template:"; }
        elsif ($cur_namespace == 11) { $cur_namespace_name = "Template_talk:"; }
        elsif ($cur_namespace == 12) { $cur_namespace_name = "Help:"; }
        elsif ($cur_namespace == 13) { $cur_namespace_name = "Help_talk:"; }
        elsif ($cur_namespace == 14) { $cur_namespace_name = ":Category:"; }
        elsif ($cur_namespace == 15) { $cur_namespace_name = "Category_talk:"; }

        # Remove leading and trailing 's.
        $cur_title =~ s/^\'//;
        $cur_title =~ s/\'$//;

        # Remove leading and trailing whitespace
        $cur_title =~ s/^\s*//;
        $cur_title =~ s/\s*$//;

        $cur_text =~ s/\\n/ /g;
        $cur_text =~ s/\s+/ /g;

        my (%chains, @chunks, $i, $per, $numberRepeated);
        @chunks = split (" ", $cur_text);

        while (@chunks > 3)
        {
            $chain = $chunks[-1]." ".$chunks[-2]." ".$chunks[-3];
            $chains{$chain}++;
            pop(@chunks);
            # Note: pop from the rear is a bjillion times more
            # efficient than unloading manually from the front.
            $i++;
        }

        # print DUPHEAD "* [[".$cur_namespace_name.$cur_title."]] $i\n";

        $printed = 0;
        foreach $chain (keys(%chains))
        {
            if ($chains{$chain} > 1)
            {
                if ($printed == 0)
                {
                    print DUPHEAD "* [[".$cur_namespace_name.$cur_title."]]";
                    $printed = 1;
                }
                # print DUPHEAD $chains{$chain}.": ".$chain."\n";
                $numberRepeated++;
            }
        }

        if ($printed == 1)
        {
            $per = int(($numberRepeated / $i) * 100);
            print DUPHEAD " ${per}% repeated - $numberRepeated out of $i triplets\n";
        }
    }

    close (ENTRIES);
    close (DUPHEAD);
}

sub dupHeaders
{
    my ($text, %headers, $line);
    $text = $_[0];

    unless ($text =~ m/=/)
    {
        # No headers means no duplicate headers
        return (0);
    }

    $text =~ s/\\n/\n/g;

    foreach $line (split ("\n", $text))
    {
        if ($line =~ m/^\s*\=/)
        {
            $headers{$line}++;
        }
    }

    foreach $line (keys(%headers))
    {
        if ($headers{$line} > 1)
        {
            # Found a duplicated header
            return(1);
        }
    }

    # Didn't return, so must not have found any duplicate headers
    return(0);
}

print `sort -nr -k3 todo/duplicate-chunks.txt > todo/duplicate-chunks-sorted.txt`;
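A note on the input format: main() evals each line of entries.txt straight into @tokens, so each line has to be a valid Perl list whose first four elements are cur_id, cur_namespace, cur_title, and cur_text, with newlines in the page text stored as literal \n (which is why the script later translates \\n). The snippet below is a minimal, self-contained sketch of that format using made-up values; it is an illustration of what the eval expects, not a line from a real dump.

#!/usr/bin/perl
use strict;

# Hypothetical example of one pre-processed entries.txt line: a
# parenthesized row tuple with the fields main() pulls out of @tokens.
my $sample = "(1234, 0, 'Example_page', "
           . "'== History ==\\nSome text.\\n== History ==\\nSame text again.\\n')";

# Parse it the same way the script does.
my @tokens;
eval("\@tokens = $sample");
my ($id, $ns, $title, $text) = @tokens;
print "$id [$ns] $title\n";    # prints: 1234 [0] Example_page

For pages that pass the dupHeaders() filter, the script writes one line per page to todo/duplicate-chunks.txt in the form "* [[Title]] NN% repeated - X out of Y triplets", and the final sort command orders that file by the percentage column.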