User:SQLBot/Readref.php
Appearance
<?php
// ---------------------------------------------------------------------------
// Command line handling and shared configuration.
//
// Usage: php <script> <DumpFile> <OutputFile> <domain> [-w] [-d]
//   -w  write flagged titles as wiki list items ("* [[Title]]")
//   -d  re-check every flagged title against the live API of <domain>
// ---------------------------------------------------------------------------

// Built before any argument check so every early exit can print it.
$helptext = "Reference Problem Finder, by SQL@Enwiki\nphp $argv[0] <DumpFile> <OutputFile> <domain> <options>\n* -w = Wikify output\n* -d = Double check against API\n\n";

// All three positional arguments are required.
if( !isset( $argv[1] ) || !isset( $argv[2] ) || !isset( $argv[3] ) ) {
    die($helptext);
}

$domain = $argv[3];

$fIn = fopen( $argv[1], "r" );
if( $fIn === FALSE ) {
    die( "Could not open dump file '{$argv[1]}' for reading.\n" );
}
$fOut = fopen( $argv[2], "w" );
if( $fOut === FALSE ) {
    die( "Could not open output file '{$argv[2]}' for writing.\n" );
}

// Option flags always hold a boolean, so later reads never touch an
// undefined variable.
$wikify = in_array( "-w", $argv, TRUE );
$doublecheck = in_array( "-d", $argv, TRUE );

// Pattern matching an opening <ref> tag (with or without attributes).
$refs = "/(<ref |<ref>)/i";
// Pattern matching any of the recognised reference-list templates or tags.
$reflist = "/(\{\{(reflist|reference|refs|footnotes)|<references)/i";
/**
 * Fetch the current wikitext of a page through the api.php endpoint of the
 * wiki named by the global $domain.
 *
 * @param string $article Page title (URL-encoded internally).
 * @return string The content of the first revision returned for the page,
 *                or an empty string when the request fails, the response
 *                cannot be decoded, or the page has no revision content.
 */
function GetPage($article) {
    global $domain;
    $url = "http://$domain/w/";
    $request = $url . 'api.php?action=query&prop=revisions&titles=' . urlencode($article) . '&rvprop=content&format=php';

    $sxGetArticle = file_get_contents($request);
    if( $sxGetArticle === FALSE ) {
        // Network or HTTP failure: report "no text" instead of decoding FALSE.
        return( "" );
    }

    // NOTE(review): unserialize() on data received from a remote server can
    // instantiate arbitrary classes. Consider switching the request to
    // format=json and decoding with json_decode() instead.
    $sxGetA = unserialize($sxGetArticle);
    if( !is_array( $sxGetA ) || !isset( $sxGetA['query']['pages'] ) || !is_array( $sxGetA['query']['pages'] ) ) {
        return( "" );
    }

    // Only one title is requested, so the first entry is the page we want.
    // Reading it directly avoids guessing the page id from the order of the
    // keys inside the page record.
    $sxPage = reset( $sxGetA['query']['pages'] );
    if( !is_array( $sxPage ) || !isset( $sxPage['revisions'][0]['*'] ) ) {
        return( "" );
    }
    return( $sxPage['revisions'][0]['*'] );
}
/**
 * Decide whether an article uses <ref> tags without providing a reference
 * list.
 *
 * The text is entity-decoded and stripped of HTML comments first, so that
 * commented-out markup does not influence the result. Any text containing
 * "#REDIRECT" is never reported. The patterns come from the globals $refs
 * and $reflist.
 *
 * @param string $text Article wikitext, possibly HTML-entity encoded.
 * @return bool TRUE when a <ref> is present but no reference list is,
 *              FALSE otherwise.
 */
function checkArticle( $text ) {
    global $refs, $reflist;
    $text = html_entity_decode( $text );

    if( stripos( $text, "<!--" ) !== FALSE ) {
        // Non-greedy, with "s" so "." also matches newlines: each comment is
        // removed on its own, including multi-line ones. A greedy match would
        // delete everything between the first "<!--" and the last "-->".
        $stripped = preg_replace( "/<!--.*?-->/s", "", $text );
        if( $stripped !== NULL ) {
            // NULL signals a PCRE error; keep the unstripped text in that case.
            $text = $stripped;
        }
    }

    if( stripos( $text, "#REDIRECT" ) !== FALSE ) {
        return( FALSE );
    }

    $hasRef = preg_match( $refs, $text );
    $hasRefList = preg_match( $reflist, $text );
    if( $hasRef && !$hasRefList ) {
        return( TRUE );
    }
    return( FALSE );
}
// ---------------------------------------------------------------------------
// Pass 1: walk the XML dump line by line, run checkArticle() on every <text>
// element and write the title of each flagged page to the output file.
// ---------------------------------------------------------------------------

// Running totals: pages examined, clean pages, flagged pages.
$num = 0;
$ok = 0;
$prob = 0;
// Title taken from the most recent <title> element; empty until one is seen.
$title = "";
$time_start = microtime(true);

while( !feof( $fIn ) ) {
    $fLine = fgets( $fIn );
    if( $fLine === FALSE ) {
        // fgets() returns FALSE at end of file or on a read error.
        break;
    }
    $fLine = trim( $fLine );

    if( preg_match( "/\<title\>(.*)\<\/title\>/i", $fLine, $mTitle ) ) {
        echo "$num [$ok / $prob]: Checking $mTitle[1]... ";
        // Titles are entity-encoded inside the XML; decode so the output
        // file holds the real page title.
        $title = html_entity_decode( $mTitle[1] );
    }

    if( strpos( $fLine, "<text" ) === FALSE ) {
        // Not the start of an article body; nothing to check on this line.
        continue;
    }

    if( preg_match( "/<text[^>]*\/>/i", $fLine ) ) {
        // Self-closing element: the page has no text at all. Without this
        // case the reader would run on into the following pages while
        // looking for a closing tag.
        $body = "";
        echo " Got text...";
    } else {
        $aText = $fLine;
        if( strpos( $fLine, "</text>" ) === FALSE ) {
            // Body spans several lines: accumulate until the closing tag.
            // Compare against FALSE explicitly, because the tag may sit at
            // offset 0 of a line, and stop when the file ends early.
            $mEndFound = FALSE;
            while( $mEndFound === FALSE ) {
                $fLine = fgets( $fIn );
                if( $fLine === FALSE ) {
                    break;
                }
                $fLine = trim( $fLine );
                $aText = $aText . $fLine;
                $mEndFound = strpos( $fLine, "</text>" );
            }
            if( $mEndFound === FALSE ) {
                echo " Unterminated <text> element, stopping.\n";
                break;
            }
            echo " End Found... ";
            $gotTextMessage = " Got Text... ";
        } else {
            $gotTextMessage = " Got text...";
        }

        // Accept any attribute list on the opening tag, not only the exact
        // xml:space="preserve" form.
        if( preg_match( "/<text[^>]*>(.*)<\/text>/is", $aText, $mText ) ) {
            $body = $mText[1];
        } else {
            $body = "";
        }
        echo $gotTextMessage;
    }

    if( !checkArticle( $body ) ) {
        echo " No problems!\n";
        $ok++;
    } else {
        echo " Problem!\n";
        // Honour -w for every page, whether its text was on one line or many.
        if( !empty( $wikify ) ) {
            fwrite( $fOut, "* [[$title]]\n" );
        } else {
            fwrite( $fOut, "$title\n" );
        }
        $prob++;
    }
    $num++;
}
// ---------------------------------------------------------------------------
// Statistics for the dump pass, then the optional double-check pass (-d)
// that re-fetches every flagged title through GetPage() and rewrites the
// output file with only the titles that are still flagged.
// ---------------------------------------------------------------------------

$time_end = microtime(true);
$time = round($time_end - $time_start, 0);

// Results of the dump pass ("no double-check"). Keys are quoted strings:
// bare words such as $nodc[time] are undefined constants, which is a fatal
// error on PHP 8.
$nodc = array();
$nodc['time'] = $time;
// A run shorter than half a second rounds to 0 seconds; avoid dividing by it.
$nodc['rps'] = ( $time > 0 ) ? $num / $time : 0;
$nodc['num'] = $num;
$nodc['prob'] = $prob;
$nodc['ok'] = $ok;
fclose( $fOut );

if( !empty( $doublecheck ) ) {
    echo "\n\nDouble-checking articles!\n\n";

    // Read the titles back before the file is truncated for rewriting.
    $articles = file( $argv[2] );
    if( $articles === FALSE ) {
        $articles = array();
    }
    $fOut = fopen( $argv[2], "w" );
    if( $fOut === FALSE ) {
        die( "Could not reopen output file '{$argv[2]}' for writing.\n" );
    }
    sort( $articles );

    $num = 0;
    $ok = 0;
    $prob = 0;
    $time_start = microtime(true);
    foreach( $articles as $article ) {
        $num++;
        $article = trim( $article );
        // Lines written in wikified form look like "* [[Title]]"; unwrap
        // them. Lines that do not match are used as they are.
        if( preg_match( "/\* \[\[(.*)\]\]/i", $article, $mArticle ) ) {
            $article = $mArticle[1];
        }
        echo "$num [$ok / $prob]: $article :";
        $aText = GetPage($article);
        if( !checkArticle( $aText ) ) {
            echo " No problems!\n";
            $ok++;
        } else {
            echo " Problem!\n";
            if( !empty( $wikify ) ) {
                fwrite( $fOut, "* [[$article]]\n" );
            } else {
                fwrite( $fOut, "$article\n" );
            }
            $prob++;
        }
    }
    fclose( $fOut );

    $time_end = microtime(true);
    $time = round($time_end - $time_start, 0);
    $rps = ( $time > 0 ) ? $num / $time : 0;
    // Number of flagged pages that the live check cleared.
    $elim = $nodc['prob'] - $prob;
    // With nothing flagged in the dump pass there is no percentage to compute.
    $dcpct = ( $nodc['prob'] > 0 ) ? $elim / $nodc['prob'] : 0;
    $dcpct = round( $dcpct * 100, 0 );
    echo "Processed $num in $time (sec) at about $rps checks per second, with double-checking enabled. DC eliminated $elim ($dcpct%) positives.\n";
}

echo "Processed {$nodc['num']} in {$nodc['time']} (sec) at about {$nodc['rps']} checks per second, with no double-checking.\nRun complete\n";
fclose( $fIn );
?>