Wikipedia:WikiProject Dates/Parse script
Appearance
Usage
cat enwiki-20080724-pages-articles.xml | php parse.php > data
Script: parse.php
<?php
/**
 * Tallies the date formats used in each article of a MediaWiki XML dump.
 *
 * The dump is read from standard input. For every article one line is
 * printed: the title (spaces replaced by underscores), then up to three
 * groups of "format:count" pairs:
 *
 *   Title {dates inside templates} <dates inside references> dates elsewhere
 *
 * A group is left out entirely when no date was found for it.
 */

parse_dump(STDIN);

/**
 * Scans a dump line by line and passes each article to process_page().
 *
 * Only <title> and <text> elements are looked at. The lines of a multi-line
 * <text> element are right-trimmed and joined with single spaces, so the
 * body given to process_page() never contains a line break.
 *
 * @param resource $input Readable stream positioned at the start of the dump.
 * @return void
 */
function parse_dump($input)
{
    $title = '';
    // Holds the text collected so far while inside a multi-line <text>
    // element; null whenever no element is open.
    $buffer = null;

    // Compare against false explicitly: a final line consisting of "0"
    // is falsy and would otherwise end the loop early.
    while (($line = fgets($input)) !== false) {
        if (preg_match('/^\s*<title>(.*)<\/title>\s*$/', $line, $matches)) {
            $title = strtr($matches[1], ' ', '_');
        } elseif ($buffer !== null) {
            if (preg_match('/(.*)<\/text>\s*$/', $line, $matches)) {
                process_page($title, $buffer . ' ' . $matches[1]);
                $buffer = null;
            } else {
                $buffer .= ' ' . rtrim($line);
            }
        } elseif (preg_match('/^\s*<text[^>]*>(.*)<\/text>\s*$/', $line, $matches)) {
            // Whole article on a single line.
            process_page($title, rtrim($matches[1]));
        } elseif (preg_match('/^\s*<text[^>]*>(.*)/', $line, $matches)) {
            // First line of a multi-line article.
            $buffer = ' ' . rtrim($matches[1]);
        }
    }
}

/**
 * Prints the tally line for one article.
 *
 * Dates are counted in three separate regions: inside {{templates}},
 * inside <ref>...</ref> elements, and in whatever text remains once both
 * have been cut out.
 *
 * @param string $title Article title, already in its printed form.
 * @param string $body  Article text exactly as it appears in the dump.
 * @return void
 */
function process_page($title, $body)
{
    print $title;

    // Text inside the dump is XML-escaped, so a reference tag arrives as
    // "&lt;ref&gt;". Decode once so the literal "<ref" pattern below can
    // match; text that contains no entities is left unchanged.
    $body = htmlspecialchars_decode($body, ENT_QUOTES);

    // Templates: "[^{}]*" only matches a template with no braces inside,
    // so nested templates are peeled off from the innermost outwards.
    $dates = null;
    while (preg_match('/(.*)\{\{([^\{\}]*)\}\}(.*)/', $body, $matches)) {
        $body = $matches[1] . ' ' . $matches[3];
        $dates = tally_dates($matches[2], $dates);
    }
    pretty_print($dates, 'template');

    // References.
    $dates = null;
    while (preg_match('/(.*)<ref[^>]*>(.*?)<\/ref[^>]*>(.*)/i', $body, $matches)) {
        $body = $matches[1] . ' ' . $matches[3];
        $dates = tally_dates($matches[2], $dates);
    }
    pretty_print($dates, 'references');

    // Everything that is left.
    pretty_print(tally_dates($body, null));
    print "\n";
}

/**
 * Counts the dates in $string, grouped by the format they are written in.
 *
 * Each format is one pass. A pass repeatedly matches its pattern, replaces
 * the matched date with a single space and tries again until nothing
 * matches. Because every pattern starts with a greedy "(.*)", the last
 * occurrence in the string is the one removed first. Pass order matters:
 * longer, more specific formats run before the shorter ones they contain.
 *
 * NOTE: preg_match() returns false on a PCRE error (for example when
 * pcre.backtrack_limit is exceeded), which ends a pass the same way as
 * "no match" does.
 *
 * @param string     $string Text to scan.
 * @param array|null $dates  Running tally (format name => count) to add to,
 *                           or null to start a new one.
 * @return array|null The updated tally. When null was passed in and no date
 *                    was found, null is returned.
 */
function tally_dates($string, $dates)
{
    $month_regex = '(january|february|march|april|may|june|july|august|september|october|november|december)';
    $regexTrail = '(.*)/iu';

    $prxDM = "\[\[(\d{1,2})[ _]{$month_regex}]]";
    $prxMD = "\[\[{$month_regex}[ _](\d{1,2})]]";
    $prxY = "\[\[(\d{1,4}([ _]BC|))]]";
    $prxISO1 = "\[\[(-?\d{4})]]-\[\[(\d{2})-(\d{2})]]";
    $prxISO2 = "\[\[(-?\d{4})-(\d{2})-(\d{2})]]";

    $DMY_linked = "/(.*){$prxDM} *,? *{$prxY}{$regexTrail}";
    $DMY_raw = "/(.*)(\d{1,2})[ _]{$month_regex} *,? *(\d{1,4}([ _]BC|)){$regexTrail}";
    $YDM_linked = "/(.*){$prxY} *,? *{$prxDM}{$regexTrail}";
    $YDM_raw = "/(.*)(\d{1,4}([ _]BC|)) *,? +(\d{1,2})[ _]{$month_regex}{$regexTrail}";
    $MDY_linked = "/(.*){$prxMD} *,? *{$prxY}{$regexTrail}";
    $MDY_raw = "/(.*){$month_regex} +(\d{1,2}) *,? +(\d{1,4}([ _]BC|)){$regexTrail}";
    // Same separator as the other linked formats (no line break inside).
    $YMD_linked = "/(.*){$prxY} *,? *{$prxMD}{$regexTrail}";
    $YMD_raw = "/(.*)(\d{1,4}([ _]BC|)) *,? +{$month_regex} +(\d{1,2}){$regexTrail}";
    $DM_linked = "/(.*){$prxDM}{$regexTrail}";
    $MD_linked = "/(.*){$prxMD}{$regexTrail}";
    $ISO1_linked = "/(.*){$prxISO1}{$regexTrail}";
    $ISO2_linked = "/(.*){$prxISO2}{$regexTrail}";
    $ISO_raw = "/(.*)(-?\d{4})-(\d{2})-(\d{2}){$regexTrail}";

    // Format name => array(pattern, number of the trailing "(.*)" group).
    // The group number differs per pattern: year/day/month formats have six
    // groups, day/month formats four, and all ISO formats five.
    $passes = array(
        'DMY_linked'  => array($DMY_linked, 6),
        'MDY_linked'  => array($MDY_linked, 6),
        'YDM_linked'  => array($YDM_linked, 6),
        'YMD_linked'  => array($YMD_linked, 6),
        'MD_linked'   => array($MD_linked, 4),
        'DM_linked'   => array($DM_linked, 4),
        'DMY_raw'     => array($DMY_raw, 6),
        'MDY_raw'     => array($MDY_raw, 6),
        'YDM_raw'     => array($YDM_raw, 6),
        'YMD_raw'     => array($YMD_raw, 6),
        'ISO1_linked' => array($ISO1_linked, 5),
        'ISO2_linked' => array($ISO2_linked, 5),
        'ISO_raw'     => array($ISO_raw, 5),
    );

    foreach ($passes as $format => $pass) {
        list($pattern, $trail) = $pass;
        while (preg_match($pattern, $string, $matches)) {
            $dates[$format] = isset($dates[$format]) ? $dates[$format] + 1 : 1;
            // Keep the text on both sides of the date for the next round.
            $string = $matches[1] . ' ' . $matches[$trail];
        }
    }

    return $dates;
}

/**
 * Prints one tally group as " format:count,format:count".
 *
 * The list is wrapped in braces for $type 'template' and in angle brackets
 * for $type 'references'; any other $type prints it bare. Nothing at all is
 * printed when $dates is not an array.
 *
 * @param array|null  $dates Tally as returned by tally_dates().
 * @param string|null $type  'template', 'references' or null.
 * @return void
 */
function pretty_print($dates, $type = null)
{
    if (!is_array($dates)) {
        return;
    }

    if ($type === 'template') {
        $open = ' {';
        $close = '}';
    } elseif ($type === 'references') {
        $open = ' <';
        $close = '>';
    } else {
        $open = ' ';
        $close = '';
    }

    $pairs = array();
    foreach ($dates as $format => $count) {
        $pairs[] = $format . ':' . $count;
    }

    print $open . implode(',', $pairs) . $close;
}