User:Rjwilmsi/genfixes
Appearance
//
// module options
bool DEBUG_MODE = false; // processes article as normal but writes lines to log file to show which regexes matched
// component options
bool APPLY_AMBIGUOUS_FIXES = false ; // automatically set to true if in report mode
bool LANGTAG_FOR_MODE = false ;
bool DO_DUPE_UNNAMED_REF_FIX = true ;
bool FIREFOX_REFLINKS = false ; // ProcessArticleInternal turns this on when APPLY_AMBIGUOUS_FIXES is set and REPORT_MODE is not
bool FIXING_TYPOS = false ; // automatically set later: ProcessArticle copies awb.RegexTypoFix.Checked into it
bool REPORT_MODE = false ; // automatically set later: ProcessArticle sets it to the inverse of awb.AutoTagCheckBox.Checked
/* available logging data types
(used as an @@@TYPE@@@ prefix on lines passed to LogToFile, e.g. "@@@MAXED@@@" and "@@@SUMMARY@@@" in ProcessArticle):
AMBIG
MULTIREF
MREFERR
DEBUG
MAXED
SUMMARY
LANGTAG */
// HideMore(string text, bool HideExternalLinks, bool LeaveMetaHeadings, bool HideImages)
// NOTE(review): the three constructor arguments below presumably line up with the three bool
// parameters named in the signature above -- confirm against the WikiFunctions.Parse.HideText API.
// ht is the instance ProcessArticle uses to Hide() text before processing and AddBack() it afterwards.
WikiFunctions.Parse.HideText ht = new WikiFunctions.Parse.HideText(false, false, true);
WikiFunctions.Parse.HideText htmore = new WikiFunctions.Parse.HideText();
// Regex fragment: one capturing group holding an alternation of language names.
// NOTE(review): "Guaran", "Norwegian Bokml" and "Volapk" look like names that lost a non-ASCII
// character (Guarani with accented i, Bokmal with a-ring, Volapuk with u-umlaut) -- confirm, and if
// restored, update any code that looks up the captured name as well.
string LangList = @"(Abkhazian|Afar|Afrikaans|Akan|Albanian|Amharic|Arabic|Aragonese|Armenian|Assamese|Avaric|Avestan|Aymara|Azerbaijani|Bambara|Bashkir|Basque|Belarusian|Bengali|Bihari|Bislama|Bosnian|Breton|Bulgarian|Burmese|Catalan|Chamorro|Chechen|Chichewa|Chinese|Church Slavic|Chuvash|Cornish|Corsican|Cree|Croatian|Czech|Danish|Divehi|Dutch|Dzongkha|English|Esperanto|Estonian|Ewe|Faroese|Fijian|Finnish|French|Fulah|Galician|Ganda|Georgian|German|Greek|Guaran|Gujarati|Haitian|Hausa|Hebrew|Herero|Hindi|Hiri Motu|Hungarian|Icelandic|Ido|Igbo|Indonesian|Interlingue|Inuktitut|Inupiaq|Irish|Italian|Japanese|Javanese|Kalaallisut|Kannada|Kanuri|Kashmiri|Kazakh|Khmer|Kikuyu|Kinyarwanda|Kirghiz|Kirundi|Komi|Kongo|Korean|Kurdish|Kwanyama|Lao|Latin|Latvian|Limburgish|Lingala|Lithuanian|Luxembourgish|Macedonian|Malagasy|Malay|Malayalam|Maltese|Manx|Marathi|Marshallese|Moldavian|Mongolian|Nauru|Navajo|Ndonga|Nepali|North Ndebele|Northern Sami|Norwegian|Norwegian Bokml|Norwegian Nynorsk|Occitan|Ojibwa|Oriya|Oromo|Ossetian|P[au]njabi|Pashto|Persian|Polish|Portuguese|Quechua|Raeto-Romance|Romanian|Russian|Samoan|Sango|Sanskrit|Sardinian|Scottish Gaelic|Serbian|Serbo-Croatian|Shona|Sichuan Yi|Sindhi|Sinhala|Slovak|Slovenian|Somali|South Ndebele|Southern Sotho|Spanish|Sundanese|Swahili|Swati|Swedish|Tagalog|Tahitian|Tajik|Tamil|Tatar|Telugu|Thai|Tibetan|Tigrinya|Tonga|Tsonga|Tswana|Turkish|Turkmen|Twi|Uighur|Ukrainian|Urdu|Uzbek|Venda|Vietnamese|Volapk|Walloon|Welsh|Western Frisian|Wolof|Xhosa|Yiddish|Yoruba|Zhuang|Zulu)";
// English month names as a CAPTURING group: using it in a pattern adds a numbered group to the replacement string.
string MonthList = @"(January|February|March|April|May|June|July|August|September|October|November|December)";
// Same month names as a NON-capturing group: match a month without shifting group numbers.
string MonthList2 = @"(?:January|February|March|April|May|June|July|August|September|October|November|December)";
// Common prefix for cite-template patterns: inline options s (dot matches newline) and i (ignore case),
// then an OPENING capturing group that runs from "{{ cit..." up to a "|" before a field name.
// The group is deliberately left unclosed; each pattern built on it supplies the closing ")".
string SICitStart = @"(?si)(\{\{\s*cit[^{}]*\|\s*";
// Capturing group for the end of a template field: optional whitespace followed by "|" or "}}".
string TemEnd = @"(\s*(?:\||\}\}))";
string ArticleTitleG = ""; // for logging use; ProcessArticle stores the current article title here
bool RELFINKS_ACTIONED = false;
/// <summary>
/// Module entry point: runs ProcessArticleInternal over the article repeatedly until a pass
/// leaves the text unchanged or the pass limit is reached, then builds the edit summary.
/// </summary>
/// <param name="ArticleText">Text of the article to process.</param>
/// <param name="ArticleTitle">Title of the article; also stored in ArticleTitleG for logging.</param>
/// <param name="wikiNamespace">Passed through unchanged to ProcessArticleInternal.</param>
/// <param name="Summary">Receives "(N) " followed by the concatenated per-pass summaries, where N is the total visible-fix count over all passes.</param>
/// <param name="Skip">Always receives false (see the note at the end of the method).</param>
/// <returns>The processed text after ht.AddBack has restored what ht.Hide removed.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    int MAX_ITERATIONS = 5;
    FIXING_TYPOS = awb.RegexTypoFix.Checked;
    REPORT_MODE = !awb.AutoTagCheckBox.Checked; // set report mode by disabling auto tagging
    if(FIXING_TYPOS)
        MAX_ITERATIONS = 1; // only a single pass when typo fixing is switched on

    string Summary2 = "";
    bool Skip2 = false;
    Skip = false;
    Summary = "";
    int VisibleFixes = 0;
    int VisibleFixesPerRun = 0;
    ArticleTitleG = ArticleTitle;

    // Hide text before processing; the matching AddBack is applied to the return value.
    ArticleText = ht.Hide(ArticleText);

    for(int i = 0; i < MAX_ITERATIONS; i++)
    {
        string Before = ArticleText;
        ArticleText = ProcessArticleInternal(ArticleText, ArticleTitle, wikiNamespace, out Summary2, out Skip2, out VisibleFixesPerRun);

        // Accumulate the summary text and visible-fix count of every pass.
        if(!String.IsNullOrEmpty(Summary2))
            Summary += Summary2;
        if(i == 0)
            Skip = Skip2; // on later loop Skip2 will come back true
        VisibleFixes += VisibleFixesPerRun;

        // Logged on the final pass whenever more than one pass is allowed, before the convergence check below.
        if(i == (MAX_ITERATIONS-1) && MAX_ITERATIONS > 1)
            LogToFile("@@@MAXED@@@reached max iterations");

        // Stop as soon as a pass leaves the text unchanged.
        if(Before.Equals(ArticleText))
            break;
    }

    // Pass the accumulated summary as a format ARGUMENT, never as part of the format string:
    // a summary containing '{' or '}' would otherwise make String.Format throw a FormatException
    // or silently rewrite the braces.
    Summary = String.Format("({0}) {1}", VisibleFixes, Summary);
    if(DEBUG_MODE)
        LogToFile(@"@@@SUMMARY@@@Summary:" + Summary);

    // Skip is unconditionally reset here, overriding the value taken from the first pass above.
    Skip = false;
    return(ht.AddBack(ArticleText));
}
public string ProcessArticleInternal(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip, out int VisibleFixes)
{
const int MAX_ITERATIONS = 50;
const int BRACKET_ON_THREAD_TIMEOUT = 2000;
Skip = false;
Summary = "" ;
VisibleFixes = 0;
// String to check whether to request skip
string OriginalArticleText = ArticleText;
// to determine edit summary
string ArticleTextBeforeOfDate = "";
string ArticleTextBeforeLanguageTemplate = "";
string ArticleTextBeforeDOBDOD = "";
string ArticleTextBeforeURL = "";
string ArticleTextBeforeCiteTemplate = "";
string ArticleTextBeforeDuplicateCiteField = "";
string ArticleTextBeforeDEFAULTSORT = "";
string ArticleTextBeforeGuessedDate = "";
string ArticleTextBeforeDayMonthYear = "";
string ArticleTextBeforeReportMode = "";
string ArticleTextBeforeDuplicateUnnamedRef = "";
// to determine edit summary counts
int VisibleFixesCheckpoint = 0;
int SpecificFixes = 0;
bool DayMonthYearFixApplied = false;
// for setting Date format
string ArticleLocaleFound = "";
// to determine if visible fixes made, logging
bool ArticleLogged = false;
if(REPORT_MODE)
APPLY_AMBIGUOUS_FIXES = true;
if(APPLY_AMBIGUOUS_FIXES && !REPORT_MODE)
FIREFOX_REFLINKS = true;
// genfixes
//VisibleFixes += RegexReplace(ref ArticleText, @"(?si)({\s*\|\s*class\s*=\s*""wikitable[^}]*?)cel(?:lpa|pad?)ding\b", "$1cellpadding"); // cellpadding typo, INAWB
// date range fixes
//VisibleFixes += RegexReplace(ref ArticleText, @"(?i){{\s*cit[^{}]*\|\s*year\s*=\s*(\[\[)?(\d\d\s*)?(?:January|February|March|April|May|June|July|August|September|October|November|December)", "$1date$2");
//VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(\[\[" + MonthList2 + @"\s*(?:[1-3]?\d)\]\])\s*(?:[^\d]|&.dash;)\s*(\[\[" + MonthList2 + @"\s*(?:[1-3]?\d)\]\])(,?)\s*(\[\[\d{3,4}\]\])", "$1$3 $4 – $2$3 $4"); // date range fix Am full wikilinked
//VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(\[\[(?:[1-3]?\d)\s*" + MonthList2 + @"\]\])\s*(?:[^\d]|&.dash;)\s*(\[\[(?:[1-3]?\d)\s*" + MonthList2 + @"\]\])(,?)\s*(\[\[\d{3,4}\]\])", "$1$3 $4 – $2$3 $4"); // date range fix Int full wikilink
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)\b" + MonthList + @"\s*([1-3]?\d)\s*(?:[^\d]|&.dash;)([1-3]?\d,?)\s+\[\[(\d{3,4})\]\]", "$1 $2–$3 $4"); // date range fix Am
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)\b([1-3]?\d)\s*(?:[^\d]|&.dash;)([1-3]?\d)\s+" + MonthList + @",?\s*\[\[(\d{3,4})\]\]", "$1–$2 $3 $4"); // date range fix Int
// fix [[link}] and [{link2]]
VisibleFixes += RegexReplace(ref ArticleText, @"([^\[\]{}])\[{([^\[\]{}]+\]\])([^\[\]{}])", "$1[[$2$3");
VisibleFixes += RegexReplace(ref ArticleText, @"([^\[\]{}])(\[\[[^\[\]{}]+)}\]([^\[\]{}])", "$1$2]]$3");
// whitespace fixes around headings
VisibleFixes += RegexReplace(ref ArticleText, @"([\.\}\]]) +(\r|\n)", "$1$2", false);
// template brackets, fixes {{{template}} and {{template}}}
if(!Regex.IsMatch(ArticleText, @"(?si)(\{\{#if.*?(\{\{\{|\}\}\})|\{\{\{num|num\|\d\}\}\})"))
{
VisibleFixes += RegexReplace(ref ArticleText, @"([^\{\}])\{\{\{([^\{\}!]+)\}\}([^\{\}])", "$1{{$2}}$3");
VisibleFixes += RegexReplace(ref ArticleText, @"([^\{\}])\{\{([^\{\}!]+)\}\}\}([^\{\}])", "$1{{$2}}$3");
}
// cite template fixes
ArticleTextBeforeCiteTemplate = ArticleText;
VisibleFixesCheckpoint = VisibleFixes;
// if date is ambiguous between American and British format, will default to American
ArticleLocaleFound = DetermineArticleDateLocale(ArticleText);
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(<ref[^\>\<\{\}]*?>\s*\{\{\s*cit[^\{\}]*?)(?:\)\)\s*)?(</ref>)", "$1}}$2"); // fix when citations have no closing }}, or )) instead
if(!Regex.IsMatch(ArticleText, @"(?si)\{\{\s*cite\s*(press\s+release\s+v2|web\s+APA)"))
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s+(?:web|news|press\s+release|journal|magazine))(?:(\s+)|(\s*)\\(\s*))(\w+)", "$1$3|$2$4$5"); // fix when cite templates have no | at start e.g. {{ cite web url=... or {{ cite web\url=...
// (part) wikilinked/external linked URL in cite template, uses MediaWiki regex of [^\[\]<>""\s] for URL bit after http://
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s+[^{}]*\|\s*url\s*=\s*)\[+\s*((?:http://)?[^\[\]<>""\s]+?\s*)\]?" + TemEnd, "$1$2$3");
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s+[^{}]*\|\s*url\s*=\s*)\[?\s*((?:http://)?[^\[\]<>""\s]+?\s*)\]+" + TemEnd, "$1$2$3");
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*?\|\s*)(?:fprmat)(\s*=\s*)", "$1format$2"); // Changes 'fprmat' typo to format
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*?\|\s*)(?:\s*date\s*)?(?:retrieved(?:\s+on)?|(?:last)?accessed|access\s+date)(\s*=\s*)", "$1accessdate$2"); // Changes non-existent retrieved field to accessdate
VisibleFixes += RegexReplace(ref ArticleText, @"(?s)Accessdate", "accessdate", false);
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)\bac(?:(?:ess?s?|cc?es|cesss|ccess)date|cessdare)\b", "accessdate");
string AccDMMD = @"(?:(?:archive|air|access)?date2?|accessdaymonth|accessmonthday)\s*=\s*";
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + AccDMMD + @"\[*" + MonthList2 + @"\s*[0-3]?\d(?:\s*\]+)?)\s*\.\s*(\[*\s*\d{4}\s*\]*)", "$1, $2"); // 'date=January 9. 2008' to 'date=January 9, 2008'
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + AccDMMD + @"\[*\s*[0-3]?\d\s*" + MonthList2 + @"(?:\s*\]+)?)\s*[\.,]+(\s*\[*\s*\d{4}\s*\]*)", "$1$2", false); // 'date=9 January, 2008' to 'date=9 January 2008'
// cite date=11/2004 --> date=November 2004, TODO scan
string CitNumSlashY = SICitStart + @"date\s*=\s*)";
string D3TemEnd = @"/([12]\d{3}\s*(?:\||}}))";
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?4" + D3TemEnd, "$1April $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?8" + D3TemEnd, "$1August $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"12" + D3TemEnd, "$1December $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?2" + D3TemEnd, "$1February $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?1" + D3TemEnd, "$1January $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?7" + D3TemEnd, "$1July $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?6" + D3TemEnd, "$1June $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?3" + D3TemEnd, "$1March $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"11" + D3TemEnd, "$1November $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"10" + D3TemEnd, "$1October $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?9" + D3TemEnd, "$1September $2");
VisibleFixes += RegexReplace(ref ArticleText, CitNumSlashY + @"0?5" + D3TemEnd, "$1May $2");
// date = YYYY-Month-DD fix
string CitYMonthD = SICitStart + @"(?:archive|air)?date2?\s*=\s*\d{4})[-/\s]";
string DTemEnd = @"?[-/\s]([0-3]?\d\s*(?:\||}}))";
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Apr(?:il|\.)" + DTemEnd, "$1-04-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Aug(?:ust|\.)" + DTemEnd, "$1-08-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Dec(?:ember|\.)" + DTemEnd, "$1-12-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Feb(?:r?uary|\.)" + DTemEnd, "$1-02-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Jan(?:uary|\.)" + DTemEnd, "$1-01-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Jul(?:y|\.)" + DTemEnd, "$1-07-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Jun(?:e|\.)" + DTemEnd, "$1-06-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Mar(?:ch|\.)" + DTemEnd, "$1-03-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Nov(?:ember|\.)" + DTemEnd, "$1-11-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Oct(?:ober|\.)" + DTemEnd, "$1-10-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"Sep(?:tember|\.)" + DTemEnd, "$1-09-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitYMonthD + @"May\." + DTemEnd, "$1-05-$2");
// cite month=Mon or month=05 fix
string CitMonth = SICitStart + @"month\s*=\s*)";
string AllCitDates = SICitStart + @"(?:(?:archive|air|access)?date2?|accessdaymonth|accessmonthday|month)\s*=[^}\|]*?)";
string MonEnd = @"[\.,]?(\s*(?:\||}}|\s))";
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Apr\.?|0?4)" + TemEnd, "$1April$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Aug\.?|0?8)" + TemEnd, "$1August$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Dec\.?|12)" + TemEnd, "$1December$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Feb\.?|0?2)" + TemEnd, "$1February$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Jan\.?|0?1)" + TemEnd, "$1January$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Jul\.?|0?7)" + TemEnd, "$1July$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Jun\.?|0?6)" + TemEnd, "$1June$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Mar\.?|0?3)" + TemEnd, "$1March$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Nov\.?|11)" + TemEnd, "$1November$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Oct\.?|10)" + TemEnd, "$1October$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"(?:Sep(?:t|\.)?|0?9)" + TemEnd, "$1September$2");
VisibleFixes += RegexReplace(ref ArticleText, CitMonth + @"0?5" + TemEnd, "$1May$2");
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?\s*=)\s*Page\s+last\s+updated\s+at\s+[0-2]\d:[0-5]\d\s*\w{3},\s*", "$1"); // remove Page last updated at ... from (BBC) date references
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?\s*=\s*)(?:(?:Mon|Tues|Wednes|Thurs|Fri|Satur|Sun)\w*\s*[,\.]*|retrieved(?:\s+on)?|accessed)\s*", "$1"); // remove day of week/"retrieved" from date field
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")year(\s*=\s*(?:\[\[)?(?:\d\d\s*)?" + MonthList2 + @")", "$1date$2", false); // year to date when value contains month
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?(?:\s*=\s*)(?:" + MonthList2 + @"\s*[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")[,\.]?\s+)'?(0\d\s*(?:\||\}\}))", "${1}20$2"); // 'DD Month YY' to YYYY fix (2000+)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?(?:\s*=\s*)(?:" + MonthList2 + @"\s*[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")[,\.]?\s+)'?([4-9]\d\s*(?:\||\}\}))", "${1}19$2"); // 'DD Month YY' to YYYY fix (1940+)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")date(\s*=\s*(?:\[\[)?(?:\d?\d\s*)?" + MonthList2 + @"(?:\s*\d?\d,?)?)(?:\s*\|)\s*year\s*=\s*(\d{4})" + TemEnd, "$1date$2 $3$4"); // date and year combiner
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")year\s*=\s*(\d{4})\s*\|\s*date(\s*=\s*(?:\[\[)?(?:\d?\d\s*)?" + MonthList2 + @"(?:\s*\d?\d,?)?)" + TemEnd, "$1date$3 $2$4"); // date and year combiner 2 of 2
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")(?:date|month)(\s*=\s*[12]\d{3}\s*(?:\||}))", "$1year$2", false); // date=YYYY or month=YYYY to year fix
// VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")year(\s*=\s*\[\[" + MonthList2 + @"\s*[0-3]?\d\]?\]?,?\s*\[?\[?(\d{4})\]?\]?)", "$1date$2"); // Fixes wikilinked year field to date (American)
// VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")year(\s*=\s*\[?\[?[0-3]?\d\s*" + MonthList2 + @"\]?\]?,?\s*\[?\[?\d{4}\]?\]?)", "$1date$2"); // Fixes wikilinked year field to date (International)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?\s*=\s*)([0-3]?\d)-(" + MonthList2 + @",?)-(\d{2,4})", "$1$2 $3 $4"); // convert dashed dates to spaces, allows YY
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?\s*=\s*)" + MonthList + @"-([0-3]?\d,?)-(\d{2,4})", "$1$2 $3 $4"); // convert dashed dates to spaces, allows YY
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s*(?:news[^g]|paper|press|episode|video)[^{}]*\|\s*accessdate\s*=\s*)(?:" + MonthList + @"\s+0?([1-3]?\d)|0?([1-3]?\d)\s*" + MonthList + @")(?:\s*\|)\s*accessyear\s*=\s*(20[01]\d)" + TemEnd, "$1$2 $3$4 $5 $6$7"); // accessyear and accessdate combiner (not for cite web as this displays correctly as-is)
// VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s+(?:web|news|press\s+release)[^{}]*\|\s*(?:access|archive)date\s*=\s*)\[\[(200\d|19[7-9]\d)-([0-1]?\d)-([0-3]?\d)\]\]", "$1$2-$3-$4"); // Fixes wikilinked ISO cite web accessdate, which displays with visible square brackets for web/news/pr templates
// VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s+[^{}]*\|\s*(?:access|archive)date\s*=\s*)\[\[(" + MonthList2 + @"\s*[0-3]?\d)\]\],?\s*\[\[(\d{4})\]\]", "$1$2 $3"); // wikilinked Am accessdate
// VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s+[^{}]*\|\s*(?:access|archive)date\s*=\s*)\[\[([0-3]?\d\s*" + MonthList2 + @")\]\],?\s*\[\[(\d{4})\]\]", "$1$2 $3"); // wikilinked Int accessdate
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Jan" + MonEnd, "$1January$2"); // Mon to Month expander for January for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Feb" + MonEnd, "$1February$2"); // Mon to Month expander for February for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Mar" + MonEnd, "$1March$2"); // Mon to Month expander for March for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Apr" + MonEnd, "$1April$2"); // Mon to Month expander for April for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Jun" + MonEnd, "$1June$2"); // Mon to Month expander for June for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Jul" + MonEnd, "$1July$2"); // Mon to Month expander for July for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Aug" + MonEnd, "$1August$2"); // Mon to Month expander for August for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Sept?" + MonEnd, "$1September$2"); // Mon to Month expander for September for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Oct" + MonEnd, "$1October$2"); // Mon to Month expander for October for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Nov" + MonEnd, "$1November$2"); // Mon to Month expander for November for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, AllCitDates + @"Dec" + MonEnd, "$1December$2"); // Mon to Month expander for December for all cite dates
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air)?date2?\s*=\s*)(\[?\[?" + MonthList2 + @")[,\.]?\s*([0-3]?\d\]?\]?)(?:st|nd|rd|th)[,\.]?(\s*\[?\[?(?:200\d|19\d\d)\]?\]?)", "$1$2 $3,$4"); // removes ordinals, extra commas from American dates
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air)?date2?\s*=\s*)(\[?\[?[0-3]?\d)\s*(?:st|nd|rd|th)[,\.]?\s*(" + MonthList2 + @"\]?\]?)[,\.]?\s*(\[?\[?(?:200\d|19\d\d)\]?\]?)", "$1$2 $3 $4"); // removes ordinals, extra commas from International dates
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + AccDMMD + @")(\[?\[?" + MonthList2 + @")[,\.]?\s*([0-3]?\d\]?\]?)\s*(?:st|nd|rd|th)[,\.]?(\s*(?:\||\}\}))" , "$1$2 $3$4"); // remove ordinals from 'Month Dth'
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + AccDMMD + @")(\[?\[?[0-3]?\d)\s*(?:st|nd|rd|th)\s*(" + MonthList2 + @"\]?\]?)[,\.]?(\s*(?:\||\}\}))" , "$1$2 $3$4"); // remove ordinals from 'Dth Month'
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s*(?:web|book|journal|magazine)[^{}]*\|\s*)date(\s*=\s*)" + MonthList + @",?\s*(\d{4})" + TemEnd, "$1month$2$3 | year$2$4$5", false); // date to year and month, for templates where these fields exist
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s*(?:web|book|journal|magazine)[^{}]*\|\s*)date(\s*=\s*)(\d{4})\s*" + MonthList + @"" + TemEnd, "$1year$2$3 | month$2$4$5", false); // 'date=YYYY Month' to year and month, for templates where these fields exist
// date=DD month=Month year=YYYY fix (or fields in different order)
ArticleTextBeforeDayMonthYear = ArticleText;
if(!Regex.IsMatch(ArticleText, @"(?si)\{\{\s*cit[^{}]*\|\s*url\s*=\s*[^\[\]<>""\s\|]+?\b(?:date|month|year)=")) // prevent matching on URL with 'date=' in it
{
if(ArticleLocaleFound.Equals("US"))
{
// date month year, date year month (US format)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"\bda(?:te|y)\s*=\s*)([0-3]?\d)\s*(\|[^{}]*?(?:\|\s*)?)\bmonth\s*=\s*" + MonthList + @"\s*\|([^{}]*)\byear\s*=\s*(\d{4})\s*(?:\||(\}\}))", "$1 $4 $2, $6 $3$5$7");
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"\bda(?:te|y)\s*=\s*)([0-3]?\d)\s*(\|[^{}]*?(?:\|\s*)?)\byear\s*=\s*(\d{4})\s*\|([^{}]*)\bmonth\s*=\s*" + MonthList + @"\s*(?:\||(\}\}))", "$1 $6 $2, $4 $3$5$7");
// month year date, month date year (US format)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")\bmonth\s*=\s*" + MonthList + @"\s*(\|[^{}]*?(?:\|\s*)?)\byear\s*=\s*(\d{4})\s*\|([^{}]*\bda(?:te|y)\s*=\s*)([0-3]?\d)\s*(?:\||(\}\}))", "$1$5 $2 $6, $4 $3$7");
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")\bmonth\s*=\s*" + MonthList + @"\s*\|([^{}]*?(?:\|\s*)?\bda(?:te|y)\s*=\s*)([0-3]?\d)\s*(\|[^{}]*)\byear\s*=\s*(\d{4})\s*(?:\||(\}\}))", "$1$3 $2 $4, $6 $5$7");
// year date month, year month date (US format)
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*)\|\s*\byear\s*=\s*(\d{4})\s*(\|[^{}]*?(?:\|\s*)?\bda(?:te|y)\s*=\s*)([0-3]?\d)(\s*\|[^{}]*)\bmonth\s*=\s*" + MonthList + @"\s*(?:\||(\}\}))", "$1$3 $6 $4, $2 $5$7");
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*\|)\s*\byear\s*=\s*(\d{4})\s*(\|[^{}]*?(?:\|\s*)?)\bmonth\s*=\s*" + MonthList + @"\s*\|([^{}]*\bda(?:te|y)\s*=\s*)([0-3]?\d)\s*(?:\||(\}\}))", "$1$5 $4 $6, $2 $3$7");
}
else // Intl format
{
// date month year, date year month (International format)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"\bda(?:te|y)\s*=\s*[0-3]?\d)\s*(\|[^{}]*?(?:\|\s*)?)\bmonth\s*=\s*" + MonthList + @"\s*\|([^{}]*)\byear\s*=\s*(\d{4})\s*(?:\||(\}\}))", "$1 $3 $5 $2$4$6");
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"\bda(?:te|y)\s*=\s*[0-3]?\d)\s*(\|[^{}]*?(?:\|\s*)?)\byear\s*=\s*(\d{4})\s*\|([^{}]*)\bmonth\s*=\s*" + MonthList + @"\s*(?:\||(\}\}))", "$1 $5 $3 $2$4$6");
// month year date, month date year (International format)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")\bmonth\s*=\s*" + MonthList + @"\s*(\|[^{}]*?(?:\|\s*)?)\byear\s*=\s*(\d{4})\s*\|([^{}]*\bda(?:te|y)\s*=\s*[0-3]?\d)\s*(?:\||(\}\}))", "$1$5 $2 $4 $3 $6");
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")\bmonth\s*=\s*" + MonthList + @"\s*\|([^{}]*?(?:\|\s*)?\bda(?:te|y)\s*=\s*[0-3]?\d)\s*(\|[^{}]*)\byear\s*=\s*(\d{4})\s*(?:\||(\}\}))", "$1$3 $2 $5 $4 $6");
// year date month, year month date (International format)
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*)\|\s*\byear\s*=\s*(\d{4})\s*(\|[^{}]*?(?:\|\s*)?\bda(?:te|y)\s*=\s*[0-3]?\d)(\s*\|[^{}]*)\bmonth\s*=\s*" + MonthList + @"\s*(?:\||(\}\}))", "$1$3 $5 $2 $4$6");
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*)\|\s*\byear\s*=\s*(\d{4})\s*(\|[^{}]*?(?:\|\s*)?)\bmonth\s*=\s*" + MonthList + @"\s*\|([^{}]*\bda(?:te|y)\s*=\s*[0-3]?\d)\s*(\||\}\})", "$1$3$5 $4 $2 $6");
}
}
if(!ArticleTextBeforeDayMonthYear.Equals(ArticleText))
{
// if matched above with day=.. then set to date=
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"\b)day(\s*=\s*[\s\w]{5,})", "$1date$2", false);
DayMonthYearFixApplied=true;
}
//VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s*(?:[ac-ikln-v][^{}]*)\|\s*)month(\s*=\s*)" + MonthList + @" \| year(?:\s*=\s*)(\d{4})" + TemEnd, "$1date$2$3 $4$5"); // year and month to date, where these fields don't exist (do exist for web, book, journal
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?(?:\s*=\s*)(?:" + MonthList2 + @"\s*[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")[,\.]?\s+)'?(0\d\s*(?:\||\}\}))", "${1}20$2"); // 'DD Month YY' to YYYY fix (2000+)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?(?:\s*=\s*)(?:" + MonthList2 + @"\s*[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")[,\.]?\s+)'?([4-9]\d\s*(?:\||\}\}))", "${1}19$2"); // 'DD Month YY' to YYYY fix (1940+)
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)({{\s*cite\s+[^{}]*\|\s*(?:archive|air|access)?date2?\s*=\s*)([0-3]?\d)-(" + MonthList2 + @",?)-(\d{2,4})", "$1$2 $3 $4"); // convert dashed dates to spaces
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)({{\s*cite\s+[^{}]*\|\s*(?:archive|air|access)?date2?\s*=\s*)" + MonthList + @"-([0-3]?\d),?-(\d{2,4})", "$1$2 $3 $4"); // convert dashed dates to spaces
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")accessdate\s*=\s*([0-3]?\d)\s*\|([^{}]*?(?:\|\s*)?)((?:accessdaymonth|accessmonthday)\s*=\s*" + MonthList2 + @")(\s*(?:\||\}\}))" , "$1$3$4 $2$5"); // accessdaymonth|accessmonthday = Month | accessdate = D?D combiner, 1 of 2
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @")((?:accessdaymonth|accessmonthday)\s*=\s*" + MonthList2 + @")(\s*\|[^{}]*?)(?:\|\s*)?accessdate\s*=\s*([0-3]?\d)\s*(\||\}\})" , "$1$2 $4$3$5"); // accessdaymonth|accessmonthday = Month | accessdate = D?D combiner, 2 of 2
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)({{\s*cit[^{}]*\|\s*date\s*=\s*)(?:(" + MonthList2 + @"\s*)0(\d,?)|0(\d\s+" + MonthList2 + @"))(\s+\d{2,4}\s*(?:\||}}))", "$1$2$3$4$5"); // removes leading zeros in days Am or Int dates
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:accessdaymonth|accessmonthday)\s*=\s*)(?:(" + MonthList2 + @"\s*)0(\d,?)|0(\d\s+" + MonthList2 + @"))" + TemEnd, "$1$2$3$4$5"); // removes leading zeros in days Am or Int Month + 0D
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)({{\s*cite\s*(?:news[^g]|paper|press|episode|video)[^{}]*\|\s*accessdate\s*=\s*)(?:" + MonthList + @"\s+0?([1-3]?\d)|0?([1-3]?\d)\s*" + MonthList + @")(?:\s*\|)\s*accessyear\s*=\s*(20[01]\d)" + TemEnd, "$1$2 $3$4 $5 $6$7"); // accessyear and accessdate combiner (not for cite web as this displays correctly as-is)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"access)monthday(\s*=\s*[0-3]?\d\s*" + MonthList2 + @"\s*(?:\||\}\}))" , "$1daymonth$2", false); // accessmonthday = Month D fix
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"access)daymonth(\s*=\s*" + MonthList2 + @"\s*[0-3]?\d\s*(?:\||\}\}))" , "$1monthday$2", false); // accessdaymonth = D Month fix
// tidy up || or |}} (maybe with whitespace between) if one of above fixes introduced it
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*)\|\s*(\}\}|\|)", "$1$2", false);
// accessdate/archivedate only fixes
if(Regex.IsMatch(ArticleText, @"(?i)\b(access|archive)date\s*="))
{
string CitAccessdate = SICitStart + @"(?:access|archive)date\s*=\s*";
// begin ISO conversion fixes for accessdate, archivedate, may soon be unnecessary due to template changes under discussion
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Apr(?:il|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-04-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Aug(?:ust|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-08-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Dec(?:ember|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-12-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Feb(?:r?uary|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-02-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Jan(?:uary|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-01-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Jul(?:y|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-07-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Jun(?:e|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-06-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Mar(?:ch|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-03-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) May(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-05-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Nov(?:ember|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-11-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Oct(?:ober|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-10-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([0-3]\d)(?:st|nd|rd|th) Sep(?:tember|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-09-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Apr(?:il|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-04-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Aug(?:ust|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-08-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Dec(?:ember|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-12-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Feb(?:r?uary|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-02-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Jan(?:uary|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-01-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Jul(?:y|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-07-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Jun(?:e|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-06-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Mar(?:ch|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-03-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) May[,\.]?(?:\]\])?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-05-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Nov(?:ember|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-11-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Oct(?:ober|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-10-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?([1-9])(?:st|nd|rd|th) Sep(?:tember|\.)?[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-09-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Apr(?:il|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-04-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Apr(?:il|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-04-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Aug(?:ust|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-08-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Aug(?:ust|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-08-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Dec(?:ember|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-12-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Dec(?:ember|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-12-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Feb(?:r?uary|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-02-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Feb(?:r?uary|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-02-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Jan(?:uary|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-01-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Jan(?:uary|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-01-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Jul(?:y|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-07-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Jul(?:y|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-07-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Jun(?:e|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-06-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Jun(?:e|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-06-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Mar(?:ch|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-03-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Mar(?:ch|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-03-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?May ([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-05-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?May ([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-05-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Nov(?:ember|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-11-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Nov(?:ember|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-11-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Oct(?:ober|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-10-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Oct(?:ober|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-10-0$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Sep(?:tember|\.)?\s+([0-3]\d)(?:\]\])?(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s+(?:20)?([01]\d)(?:\]\])?", "${1}20$3-09-$2");
VisibleFixes += RegexReplace(ref ArticleText, CitAccessdate + @")(?:\[\[)?Sep(?:tember|\.)?\s+([1-9])(?:st|nd|rd|th)[,\.]?(?:\]\])?[,\.]?\s*(?:20)?(0[4-8])(?:\]\])?", "${1}20$3-09-0$2");
// end ISO conversion fixes
}
// Cite date fields written as YYYY-DD-MM: the middle number must be 13-39, so it can only be a day;
// swap it with the 01-12 value that follows. Only years 200x are handled. An optional leading "[["
// and trailing "]]" around the date are kept.
// (SICitStart is defined outside this section; presumably it opens capture group 1 - confirm.)
VisibleFixes += LoopedRegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?\s*=\s*(?:\[\[)?200\d)-([2-3]\d|1[3-9])-(0[1-9]|1[0-2])(\]\])?", "$1-$3-$2$4"); // YYYY-DD-MM to YYYY-MM-DD
// A time of day trailing the date in a {{cite ...}} date field is not deleted: everything after the
// date up to the next '|' or '}' is wrapped in an HTML comment.
// NOTE(review): "[,-:]" is a character RANGE from ',' to ':' and therefore also accepts '-', '.', '/'
// and digits as the separator - confirm whether only ',', '-' and ':' were intended.
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite[^\{\}]*\|\s*(?:archive|air|access)?date2?\s*=\s*(?:(?:200\d|19[7-9]\d)-[01]?\d-[0-3]?\d|[0-3]?\d\s*\w+,?\s*(?:200\d|19[7-9]\d)|\w+\s*[0-3]?\d,?\s*(?:200\d|19[7-9]\d)))(\s*[,-:]?\s+[0-2]?\d\:?[0-5]\d(?:\:?[0-5]\d)?\s*[^\|\}]*)", "$1<!--$2-->"); // Removes time from date fields
// 'retrieved on DD Mon YYYY</ref>' or 'retrieved on Mon DD, YYYY</ref>': expand the abbreviated month to its full name. TODO scan & test
// RefRetrieved captures the ref from its opening tag up to the month (group 1); RefEnd captures the rest up to </ref> (group 2).
string RefRetrieved = @"(?si)(<ref[^<>]*?>[^<>]*?retrieved(?:\s+on)?[^<>]*?)\b";
string RefEnd = @"(\s[^<>]*?</ref>)";
// Abbreviation patterns and their expansions, listed in the order they are applied (May has no short form).
string[] RetrievedMonthAbbrevs = { @"Jan\.?", @"Feb\.?", @"Mar\.?", @"Apr\.?", @"Jun\.?", @"Jul\.?", @"Aug\.?", @"Sept?\.?", @"Oct\.?", @"Nov\.?", @"Dec\.?" };
string[] RetrievedMonthNames = { "January", "February", "March", "April", "June", "July", "August", "September", "October", "November", "December" };
for(int RetrievedMonthIdx = 0; RetrievedMonthIdx < RetrievedMonthAbbrevs.Length; RetrievedMonthIdx++)
{
    string RetrievedMonthPattern = RefRetrieved + RetrievedMonthAbbrevs[RetrievedMonthIdx] + RefEnd;
    string RetrievedMonthReplacement = "$1" + RetrievedMonthNames[RetrievedMonthIdx] + "$2";
    VisibleFixes += RegexReplace(ref ArticleText, RetrievedMonthPattern, RetrievedMonthReplacement);
}
// format ambiguous cite dates
// Snapshot taken even when the fixes are disabled, so the later before/after comparison is always valid.
ArticleTextBeforeGuessedDate = ArticleText;
if(APPLY_AMBIGUOUS_FIXES)
{
    string ArchiveAirAccess = SICitStart + @"(?:archive|air|access)?date2?\s*=\s*)";
    // Pattern pieces shared by every rule below.
    string AmbigSep = @"[/_\-\.]";                 // mandatory separator (two-digit-year forms)
    string AmbigOptSep = AmbigSep + "?";           // optional separator (four-digit-year forms)
    string AmbigTenToTwelve = @"(1[0-2])";         // 10, 11 or 12
    string AmbigOneToNine = @"0?([1-9])";          // 1-9, leading zero dropped from the capture
    string AmbigShortYear = @"(?:20)?([01]\d)\b";  // 00-19, optional "20" prefix not captured
    string AmbigFullYear = @"(200\d|19[6-9]\d)\b"; // 1960-2009 captured in full
    // Each row: pattern, replacement when the first number is the month (US), replacement when it is the day (Intl).
    // Both numbers are 1-12 in every pattern, which is what makes the date ambiguous.
    string[,] AmbigDateRules =
    {
        { ArchiveAirAccess + AmbigTenToTwelve + AmbigSep + AmbigOneToNine + AmbigSep + AmbigShortYear, "${1}20$4-$2-0$3", "${1}20$4-0$3-$2" },
        { ArchiveAirAccess + AmbigTenToTwelve + AmbigSep + AmbigTenToTwelve + AmbigSep + AmbigShortYear, "${1}20$4-$2-$3", "${1}20$4-$3-$2" },
        { ArchiveAirAccess + AmbigOneToNine + AmbigSep + AmbigOneToNine + AmbigSep + AmbigShortYear, "${1}20$4-0$2-0$3", "${1}20$4-0$3-0$2" },
        { ArchiveAirAccess + AmbigOneToNine + AmbigSep + AmbigTenToTwelve + AmbigSep + AmbigShortYear, "${1}20$4-0$2-$3", "${1}20$4-$3-0$2" },
        { ArchiveAirAccess + AmbigTenToTwelve + AmbigOptSep + AmbigOneToNine + AmbigOptSep + AmbigFullYear, "$1$4-$2-0$3", "$1$4-0$3-$2" },
        { ArchiveAirAccess + AmbigTenToTwelve + AmbigOptSep + AmbigTenToTwelve + AmbigOptSep + AmbigFullYear, "$1$4-$2-$3", "$1$4-$3-$2" },
        { ArchiveAirAccess + AmbigOneToNine + AmbigOptSep + AmbigOneToNine + AmbigOptSep + AmbigFullYear, "$1$4-0$2-0$3", "$1$4-0$3-0$2" },
        { ArchiveAirAccess + AmbigOneToNine + AmbigOptSep + AmbigTenToTwelve + AmbigOptSep + AmbigFullYear, "$1$4-0$2-$3", "$1$4-$3-0$2" }
    };
    // The US reading is used only when the detected locale is exactly "US"; any other value gets the international reading.
    int AmbigReplacementColumn = ArticleLocaleFound.Equals("US") ? 1 : 2;
    for(int AmbigRow = 0; AmbigRow < AmbigDateRules.GetLength(0); AmbigRow++)
    {
        VisibleFixes += RegexReplace(ref ArticleText, AmbigDateRules[AmbigRow, 0], AmbigDateRules[AmbigRow, AmbigReplacementColumn]);
    }
    // A time following an ISO date is wrapped in an HTML comment rather than deleted.
    VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"(?:archive|air|access)?date2?\s*=\s*(?:200\d|19[7-9]\d)-[0-1]\d-[0-3]\d\s+)([0-2]?\d\:?[0-5]\d(?:\:?[0-5]\d)?\s*[^[\|}]*)", "$1<!--$2-->"); // Removes time from ISO date fields
}
// Edit-summary entries for the cite date work: fixes counted since the last checkpoint,
// the guessed-locale conversions, and the day/month/year field merge.
SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
bool CiteTemplateTextChanged = !ArticleTextBeforeCiteTemplate.Equals(ArticleText);
if(CiteTemplateTextChanged && SpecificFixes > 0)
{
    Summary += "format cite template dates (" + SpecificFixes + "), ";
}
bool GuessedDateTextChanged = !ArticleTextBeforeGuessedDate.Equals(ArticleText);
if(GuessedDateTextChanged)
{
    Summary += String.Format("format ambiguous cite dates (set {0} format), ", ArticleLocaleFound);
}
if(DayMonthYearFixApplied)
{
    Summary += "combine day, month & year, ";
}
// date of birth / death fixes
// Expands "b." / "d." abbreviations in the parenthesised dates that follow a bold ('''...''') article title.
ArticleTextBeforeDOBDOD = ArticleText;
VisibleFixesCheckpoint = VisibleFixes;
// Group 1: from a bold title at the start of a line, lazily up to and including a '(', ';' or ',' plus trailing whitespace.
string DOBDODStart = @"(?mi)^('''[^'\n]+'''.+?[\(;,]\s*)";
// Full stop of the abbreviation, then group 2: optional "Month DD" / "DD Month" (optionally wikilinked) and a 3-4 digit year, then ')'.
// Whitespace after the full stop is matched OUTSIDE the capture: the replacements below insert their own
// single space, so capturing it produced a double space ("(b. 1950)" --> "(born  1950)").
string DOBDODEnd = @"\.\s*(\[*(?:" + MonthList2 + @"\s+[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")?\]*,?\s*\[*[1-2]?\d{3}\]*)\s*\)";
// "d. <date>)" anywhere on a line (no line-start anchor); the captured whitespace is reused as the separator.
VisibleFixes += RegexReplace(ref ArticleText, @"(?mi)('''[^'\n]+'''.+?[\(;,]\s*)d\.(\s+\[*(?:" + MonthList2 + @"\s+[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")?\]*,?\s*\[*[1-2]?\d{3}\]*)\s*\)", "$1died$2)"); // date of death
// "born X <sep> died Y)" --> "X – Y)", where <sep> is one of , ; : - – or an &?dash; entity.
VisibleFixes += RegexReplace(ref ArticleText, DOBDODStart + @"(?:born|b\.)\s*(\[*(?:" + MonthList2 + @"\s+[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")?\]*,?\s*\[*[1-2]?\d{3}\]*.*?)\s*(?:[,;:\-–]|&.dash;)\s*(?:died|d\.)\s*(\[*(?:" + MonthList2 + @"\s+[0-3]?\d|[0-3]?\d\s*" + MonthList2 + @")?\]*,?\s*\[*[1-2]?\d{3}\]*.*?)\s*\)", "$1$2 – $3)"); // birth and death, (regex shares date clause logic with birth, death fixes)
VisibleFixes += RegexReplace(ref ArticleText, DOBDODStart + @"d" + DOBDODEnd, "$1died $2)"); // date of death
VisibleFixes += RegexReplace(ref ArticleText, DOBDODStart + @"b" + DOBDODEnd, "$1born $2)"); // date of birth
if(!ArticleTextBeforeDOBDOD.Equals(ArticleText))
{
    SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
    Summary += String.Format("format date of birth/death ({0}), ", SpecificFixes);
}
// reference needed --> {{fact}}
// Hand-written "reference needed" / "citation needed" notes are replaced by a dated {{fact}} template.
VisibleFixesCheckpoint = VisibleFixes;
string FactTagReplacement = @"{{fact|date={{subst:CURRENTMONTHNAME}} {{subst:CURRENTYEAR}}}}";
// Bracketed form, optionally inside <sup>/<ref>/<small> tags and optionally quoted with matching apostrophes.
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)\s*(?:<((?:sup|ref|small)>)\s*)?[\(\[{]*('*)[\(\[{]+\**\s*(?:reference|citation)s?\s+needed\s*(?:[\)\]]+|\}+)\2[\)\]\}]*(?:\s*</\1)?", FactTagReplacement);
// Bare text that is the entire content of an unnamed <ref>.
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)\s*<ref>\s*(?:reference|citation)s?\s+needed\s*</ref>", FactTagReplacement);
if(VisibleFixes > VisibleFixesCheckpoint)
{
    Summary += @"'reference needed' --> {{fact}} template, ";
}
// Reset the checkpoint for the ref format fixes that follow.
VisibleFixesCheckpoint = VisibleFixes;
// log any articles with unlinked bare refs
// A "bare ref" is an unnamed <ref> whose whole content is a single http:// or www. URL.
// The test is evaluated once; ArticleText is not modified between the two uses below.
bool HasBareUrlRef = Regex.IsMatch(ArticleText, @"(?si)<ref>\s*(?:http://|www\.)[^\[\]<>""\s]+?\s*</ref>");
if(HasBareUrlRef && FIREFOX_REFLINKS && !RELFINKS_ACTIONED)
{
    // The title is percent-encoded before being placed in the query string: titles containing
    // '&', '+', '#' or '"' would otherwise corrupt the page parameter or the quoted command-line argument.
    string ReflinksPageParam = System.Uri.EscapeDataString(ArticleTitle ?? "");
    System.Diagnostics.Process.Start(@"""C:\Program Files\Mozilla Firefox\firefox.exe""", @"-new-tab ""http://toolserver.org/~dispenser/cgi-bin/webreflinks.py?page=" + ReflinksPageParam + @"&citeweb=on&force=on""");
    // Flag set so the browser tab is opened at most once while it stays true.
    RELFINKS_ACTIONED = true;
}
if(HasBareUrlRef && !FIREFOX_REFLINKS)
{
    LogToFile("@@@REFLINKS@@@unlinked bare refs found, run reflinks");
}
// Opening tag of a named ref without its closing '>' (name optionally quoted, no '/' inside the name).
string NamedRefOpenTag = @"<\s*ref\s+name\s*=\s*""?[^<>=""\/]+?""?\s*";
// <ref>...<ref><ref> --> <ref>...</ref><ref>
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(<\s*ref(?:\s+name\s*=[^<>]*?)?\s*>[^<>""]+?)<ref>(\s*<ref>)", "$1</ref>$2");
// <ref name="Fred"><ref name="Bert"> --> <ref name="Fred"/><ref name="Bert">
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(" + NamedRefOpenTag + @")>(" + NamedRefOpenTag + @">)", "$1/>$2");
// <ref name="Fred"> </ref> --> <ref name="Fred" /> (done as genfix too but do here to avoid matching on DUPREF)
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(" + NamedRefOpenTag + @")>\s*<\s*/\s*ref\s*>", "$1 />", false);
SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
if(SpecificFixes > 0)
{
    Summary += String.Format("fix ref format ({0}), ", SpecificFixes);
}
// end ref format fixes
// duplicate citation fixer (both named), same name but different ref: remove second ref as per WP:REFNAME this won't be used
// For every pair of full <ref name="X">...</ref> definitions sharing a name, decide whether the two
// bodies are the same citation. If so, the second becomes the short form <ref name="X"/>; if not
// (and ambiguous fixes are enabled), the second is renamed to X_2.
// URLformat / ISBNformat match a whole ref body and capture its first http URL / ISBN-ISSN number in group 1.
Regex URLformat = new Regex(@"(?si).*?\b(http://[^\[\]<>""\s\|{}]+).*", RegexOptions.Compiled);
Regex ISBNformat = new Regex(@"(?si).*?\bIS[BS]N(?:\s*=\s*)?(\d[\d-\s]+\d)\b.*", RegexOptions.Compiled);
// Groups: 1 = first ref (body of at least 25 chars), 2 = name, 3 = first body, 4 = text in between, 5 = second opening tag without '>', 6 = second body.
Regex DupBothNamedRegex = new Regex(@"(?s)(<\s*ref\s+name\s*=\s*""?([^<>=""]+)""?\s*>\s*([^<>]{25,})\s*<\s*/\s*ref>)(.*?)(<\s*ref\s+name\s*=\s*""?\2""?)\s*>\s*([^<>]+?)\s*<\s*/\s*ref>", RegexOptions.Compiled);
VisibleFixesCheckpoint = VisibleFixes;
if (DupBothNamedRegex.IsMatch(ArticleText))
{
    // The matches come from the text as it is now; only their captured strings are used while ArticleText is edited below.
    MatchCollection MatchList1 = DupBothNamedRegex.Matches(ArticleText);
    for (int i = 0; i < MatchList1.Count; i++)
    {
        bool RefsAreTheSame = false;
        bool Certain = false; // true when the match is safe to apply without APPLY_AMBIGUOUS_FIXES
        string ReasonTheSame = "";
        Match TheMatch = MatchList1[i];
        string DupRefName = TheMatch.Groups[2].Value;
        string RefValue1 = TheMatch.Groups[3].Value;
        string RefValue2 = TheMatch.Groups[6].Value;
        // TODO: if refs differ by pages only, rename them?
        if(DifferentByPageOnly(RefValue1, RefValue2))
        {
            LogToFile(String.Format("@@@DUPREF@@@ref '{0}' differs by pages only@@@{1}@@@{2}", DupRefName, RefValue1, RefValue2));
            continue;
        }
        float Similarity = GetSimilarity(RefValue1, RefValue2);
        // refs the same if ref2 value is ibid or space or very short
        if(RefValue2.Trim().Length < 5)
        {
            RefsAreTheSame = true;
            ReasonTheSame = "Ref2 very short";
            Certain = true;
        }
        if(!RefsAreTheSame && Regex.IsMatch(RefValue2, @"(?is)\b(ibid|op.{1,4}cit|see\s+above|empty|internet| )\b") && RefValue2.Length < 12)
        {
            RefsAreTheSame = true;
            ReasonTheSame = "Ref2 is ibid-like";
            Certain = true;
        }
        if(!RefsAreTheSame && Regex.Replace(RefValue1.Trim(), @"\W", "").Equals(Regex.Replace(RefValue2.Trim(), @"\W", "")))
        {
            RefsAreTheSame = true;
            ReasonTheSame = "Refs same except non-word chars";
            Certain = true;
        }
        // the same if URL in them is the same, if different then not the same
        if(!RefsAreTheSame && URLformat.IsMatch(RefValue1))
        {
            // Replace(..., "$1") reduces a body to its URL; a body with no URL is returned unchanged.
            string URL1 = URLformat.Replace(RefValue1, "$1");
            string URL2 = URLformat.Replace(RefValue2, "$1");
            if(DEBUG_MODE)
                LogToFile(String.Format("URL1 = '{0}' and URL2 = '{1}'", URL1, URL2));
            if(URL1.Trim().Equals(URL2.Trim()))
            {
                RefsAreTheSame = true;
                ReasonTheSame = "Refs have same URL";
                if(FieldsTheSame(RefValue1, RefValue2, "title") == 0)
                {
                    ReasonTheSame = "Refs have same title and URL";
                    Certain = true;
                }
            }
        }
        // the same if ISBN in them is the same, if different then not the same
        if(!RefsAreTheSame && ISBNformat.IsMatch(RefValue1))
        {
            string ISBN1 = ISBNformat.Replace(RefValue1, "$1");
            string ISBN2 = ISBNformat.Replace(RefValue2, "$1");
            if(DEBUG_MODE)
                LogToFile(String.Format("@@@DEBUG@@@ISBN1 = '{0}' and ISBN2 = '{1}'", ISBN1, ISBN2));
            if(ISBN1.Trim().Equals(ISBN2.Trim()))
            {
                RefsAreTheSame = true;
                ReasonTheSame = "Refs have same ISBN";
            }
        }
        /*TODO what else to match on now, test below */
        if(AllFieldsTheSame(RefValue1, RefValue2, "pmid", "author", "year"))
        {
            RefsAreTheSame = Certain = true;
            ReasonTheSame = "Refs have same pmid, author, year";
        }
        if(AllFieldsTheSame(RefValue1, RefValue2, "pmid", "title", "year"))
        {
            RefsAreTheSame = Certain = true;
            ReasonTheSame = "Refs have same pmid, title, year";
        }
        if(AllFieldsTheSame(RefValue1, RefValue2, "author", "title", "year"))
        {
            RefsAreTheSame = Certain = true;
            ReasonTheSame = "Refs have same author, title, year";
        }
        if(!RefsAreTheSame && Similarity > 0.70)
        {
            RefsAreTheSame = true;
            ReasonTheSame = "Refs have high similarity";
        }
        if(APPLY_AMBIGUOUS_FIXES && !Certain && ReasonTheSame.Equals(""))
            ReasonTheSame = "Ambiguous mode";
        if(RefsAreTheSame && (Certain || APPLY_AMBIGUOUS_FIXES))
        {
            // Collapse the second definition to the short form, keeping the first definition and the text in between.
            VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)(<\s*ref\s+name\s*=\s*""?(" + Regex.Escape(DupRefName) + @")""?\s*>\s*([^<>]{25,})\s*<\s*/\s*ref>)(.*?)(<\s*ref\s+name\s*=\s*""?\2""?)\s*>\s*([^<>]+?)\s*<\s*/\s*ref>", @"$1$4$5/>");
            LogToFile(String.Format("@@@DUPREF@@@ref '{0}' has values@@@{1}@@@{2}@@@similarity {3}@@@Reason: {4}", DupRefName, RefValue1, RefValue2, Similarity, ReasonTheSame));
        }
        if(RefsAreTheSame && !Certain && !APPLY_AMBIGUOUS_FIXES)
            LogToFile(String.Format("@@@DUPREF@@@ref '{0}' has values@@@{1}@@@{2}@@@similarity {3}@@@Reason: {4}@@@But ambiguous", DupRefName, RefValue1, RefValue2, Similarity, ReasonTheSame));
        if(!RefsAreTheSame && APPLY_AMBIGUOUS_FIXES)
        { // change ref name of second ref if not used elsewhere
            // Rename only when (a) no short-form <ref name="X"/> relies on the name and (b) no ref is already
            // called X_2. The second test previously looked for an "_02" suffix placed after the closing quote,
            // which never matches the "_2" name generated below, so an existing X_2 could be duplicated.
            bool NameUsedInShortForm = Regex.IsMatch(ArticleText, @"(?s)(<\s*ref\s+name\s*=\s*""?" + Regex.Escape(DupRefName) + @")""?\s*/\s*>");
            bool RenamedNameTaken = Regex.IsMatch(ArticleText, @"(?s)<\s*ref\s+name\s*=\s*""?" + Regex.Escape(DupRefName) + @"_2""?\s*/?\s*>");
            if(!NameUsedInShortForm && !RenamedNameTaken)
            {
                VisibleFixes += RegexReplace(ref ArticleText, @"(?s)(<\s*ref\s+name\s*=\s*""?" + Regex.Escape(DupRefName) + @")(""?\s*>\s*" + Regex.Escape(RefValue2) + @"\s*<\s*/\s*ref>)", "$1_2$2");
                LogToFile(String.Format("@@@DUPREF@@@ref '{0}' has values@@@{1}@@@{2}@@@similarity {3}@@@Renamed second", DupRefName, RefValue1, RefValue2, Similarity));
            }
            else
                LogToFile(String.Format("@@@DUPREF@@@ref '{0}' has values@@@{1}@@@{2}@@@similarity {3}@@@Not same, failed rename of second", DupRefName, RefValue1, RefValue2, Similarity));
        }
    }
    SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
    // Only mention this fix in the edit summary when something was actually changed
    // (every pair may have been skipped or merely logged).
    if(SpecificFixes > 0)
        Summary +=String.Format("tidy up duplicate named refs ({0}), ", SpecificFixes);
}
// duplicate citation fixer (not named): add named reference to first and use in latter ones
// Each repeated unnamed <ref>text</ref> is first given a temporary name "multirefN", then renamed to a
// readable name derived from its content. When no usable, unused name can be derived, the article text
// is rolled back to what it was before that attempt.
// Groups: 1 = first ref, 2 = its body, 3 = text in between, 4 = a later unnamed ref with the identical body.
Regex DuplicateUnnamedRef = new Regex(@"(?s)(<\s*ref\s*>\s*([^<>]+)\s*<\s*/\s*ref>)(.*?)(<\s*ref\s*>\s*\2\s*<\s*/\s*ref>)", RegexOptions.Compiled);
ArticleTextBeforeDuplicateUnnamedRef = ArticleText;
VisibleFixesCheckpoint = VisibleFixes;
if(DuplicateUnnamedRef.IsMatch(ArticleText) && DO_DUPE_UNNAMED_REF_FIX)
{
    // Do nothing when the article already contains a multirefN name, which would clash with the temporary names.
    if(!Regex.IsMatch(ArticleText, @"<ref name=""multiref\d+""/?>"))
    {
        // Matches are taken from the text before any change; entry j-1 supplies the body handled in pass j.
        MatchCollection MultirefMC = DuplicateUnnamedRef.Matches(ArticleText);
        for(int j = 1; j<MAX_ITERATIONS; j++)
        {
            string Before = ArticleText; // rollback point for this pass
            string Replace = String.Format(@"<ref name=""multiref{0}"">$2</ref>$3<ref name=""multiref{0}""/>", j);
            string FriendlyReplace = "";
            string FriendlyName = "";
            string Multiref = String.Format(@"multiref{0}", j);
            string MultirefRefString = "";
            int NotCounting = 0; // receives the rename count so it is not added to VisibleFixes
            // Pattern fragments used to derive a name from "Author ... (year) ... p. N" style references.
            string RefName = @"(?si)<\s*ref\s+name\s*=\s*""";
            string NameMask = @""">(?-i)\s*(?:sir)?\s*((?:[A-Z]+\.?){0,3}\s*[A-Z][\w-']{2,}[,\.]?\s*(?:\s+\w\.?|\b(?:[A-Z]+\.?){0,3})?(?:\s+[A-Z][\w-']{2,}){0,3}(?:\s+\w(?:\.?|\b)){0,2})\s*(?:[,\.'&;:\[\(“`]|et\s+al)(?i)[^{}<>\n]*?";
            string YearMask = @"(\([12]\d{3}\)|\b[12]\d{3}[,\.\)])";
            string PageMask = @"('*(?:p+g?|pages?)'*\.?'*(?: )?\s*(?:\d{1,3}|(?-i)[XVICM]+(?i))\.?(?:\s*[-/&\.,]\s*(?:\d{1,3}|(?-i)[XVICM]+(?i)))?\b)";
            if(j == 1 || DuplicateUnnamedRef.IsMatch(ArticleText))
            {
                try
                {
                    Match MultirefMatch = MultirefMC[j-1];
                    MultirefRefString = MultirefMatch.Groups[2].Value;
                }
                // Only the indexer's out-of-range error ends the loop; the former bare catch also
                // swallowed every unrelated exception and silently stopped processing.
                catch (System.ArgumentOutOfRangeException) // i.e. no more matches
                {
                    break; // if no more matches in collection, we've finished
                }
                if(Regex.IsMatch(MultirefRefString, @"(?is)\b(ibid|op.{1,4}cit)\b") && !APPLY_AMBIGUOUS_FIXES)
                {
                    LogToFile("@@@MREFERR@@@ref contains ibid/op cit@@@" + Regex.Replace(MultirefRefString, @"\n", " "));
                    ArticleText = Before;
                    continue;
                }
                //ArticleText = DuplicateUnnamedRef.Replace(ArticleText, Replace, 1); // replacements limited to 1
                // Name the first occurrence multirefN and turn one later identical ref into the short form.
                Regex MultirefReplace = new Regex(@"(?s)(<\s*ref\s*>\s*(" + Regex.Escape(MultirefRefString) + @")\s*<\s*/\s*ref>)(.*?)(<\s*ref\s*>\s*\2\s*<\s*/\s*ref>)", RegexOptions.Compiled);
                ArticleText = MultirefReplace.Replace(ArticleText, Replace, 1);
                FriendlyName = "";
                // Each step below runs only while the current candidate is unusable: already used as a ref
                // name in the article, or shorter than 4 characters. A later step replaces the candidate.
                // try description of a simple external link
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + @""">\s*[^{}<>\n]*?\s*\[*(?:http://www\.|http://|www\.)[^\[\]<>""\s]+?\s+([^{}<>\[\]]{4,35}?)\s*(?:\]|<!--|⌊⌊⌊⌊|</ref>)", 1);
                // website URL first, allowing a name before link
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + @""">\s*\w*?[^{}<>]{0,4}?\s*(?:\[?|\{\{\s*cit[^{}<>]*\|\s*url\s*=\s*)\s*(?:http://www\.|http://|www\.)([^\[\]<>""\s\/:]+)", 1);
                // Harvnb template {{Harvnb|Young|1852|p=50}}
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + @""">\s*{{Harvnb\s*\|\s*([^{}\|]+?)\s*\|\s*(\d{4})\s*\|\s*([^{}\|]+?)\s*}}\s*", 3);
                // now just try to use the whole reference if it's short (<35 characters)
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + @""">\s*([^<>{}]{4,35})\s*</ref>", 1);
                //now try title of a citation
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + @""">\s*\{\{\s*cit[^{}<>]*\|\s*url\s*=\s*([^\/<>{}\|]{4,35})", 1);
                // name...year...page
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + NameMask + YearMask + @"[^{}<>\n]*?" + PageMask + @"\s*</ref>", 3);
                // name...page
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + NameMask + PageMask + @"\s*</ref>", 2);
                // name...year
                if(Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>") || FriendlyName.Length < 4)
                    FriendlyName = DeriveFriendlyName(ArticleText, RefName + Multiref + NameMask + YearMask + @"\s*</ref>", 2);
                // now see if specific reference name has been written
                // string MultirefRefStringToCompare = Regex.Replace(MultirefRefString, @"\n", " ");
                // ="if (MultirefRefStringToCompare.Equals(@"""&D1&""")) FriendlyName = """&F1&""";
                FriendlyName = CleanFriendlyName(FriendlyName);
                try // if FriendlyName has some brackets or something, will get regex parse error
                {
                    // if can't get a distinct name, revert multiref and log
                    if(FriendlyName.Length > 3 && Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>"))
                    {
                        LogToFile("@@@MREFERR@@@FriendlyName '" + FriendlyName + "' already in use for@@@" + Regex.Replace(MultirefRefString, @"\n", " "));
                        ArticleText = Before;
                        continue;
                    }
                    // only attempt to rename if suitable name found and isn't already used by article
                    if(FriendlyName.Length > 3 && !Regex.IsMatch(ArticleText, RefName + FriendlyName + @"""\s*/?\s*>"))
                    {
                        FriendlyReplace = @"${1}" + FriendlyName + @"${2}";
                        NotCounting += LoopedRegexReplace(ref ArticleText, @"(?si)(<ref name="")" + Multiref + @"(""/?>)", FriendlyReplace);
                        LogToFile("@@@MULTIREF@@@'" + Multiref + "' --> '" + FriendlyName + "' for@@@" + Regex.Replace(MultirefRefString, @"\n", " "));
                        VisibleFixes++; // the naming plus rename counts as a single visible fix
                    }
                    else
                    {
                        LogToFile("@@@MREFERR@@@'" + Multiref + "' failed to get renamed for@@@" + MultirefRefString);
                        ArticleText = Before;
                        continue;
                    }
                    //now do 'first named' on loop, will have hits if three or more unnamed references were the same...
                    VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)(<\s*ref\s+name\s*=\s*""([^<>""\/]+)""\s*>\s*([^<>]+)\s*<\s*/\s*ref>)(.*?)(<\s*ref\s*>)\s*\3\s*<\s*/\s*ref>", @"$1$4<ref name=""$2""/>");
                }
                catch // regex parse exception
                {
                    ArticleText = Before;
                    LogToFile("@@@MREFERR@@@ parse exception on FriendlyName@@@" + FriendlyName);
                    continue;
                }
            }
            else
                break;
        }
    }
    else
        LogToFile("@@@MULTIREF@@@set to insert multiref but one already there");
}
SpecificFixes = VisibleFixes-VisibleFixesCheckpoint;
if(SpecificFixes > 0)
    Summary += String.Format("set identical unnamed references to use named refs ({0}), ", SpecificFixes);
// end ref combine fixes
// URL format fixes, may be caught by AWB gen fixes before reaching here
// The text snapshot and counter checkpoint let the summary at the end report only this section's changes.
// The fixes apply to links preceded by whitespace, '[', '>' or '='.
ArticleTextBeforeURL = ArticleText;
VisibleFixesCheckpoint = VisibleFixes;
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)([\s\[>=])htp://(\w+)", "$1http://$2"); // 'htp' instead of 'http' in an external link
// The slash repair is skipped when the article contains "HTTP/" followed by a digit and a dot (case-sensitive test).
if (!Regex.IsMatch(ArticleText, @"HTTP/\d\."))
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)([\s\[>=](?:ht|f))tp:(?:/|///)(\w+)", "$1tp://$2"); // Fixes single or triple slash in ftp or http external link
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)([\s\[>=])((?:ht|f)tp:?/+)(\2)+", "$1$2"); // Fixes multiple http:// or ftp:// in an external link
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(\[(?:http://|www\.)[^\[\]<>""\s]*?)\|''", "$1 ''"); // fixes [www.site.com|''my cool site''] which links with the bar at the end of the URL
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cite\s+(?:web|journal)[^{}]*\|\s*url\s*=\s*)(www\.)", "$1http://$2"); // for cite web/journal the URL requires http:// at start
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(http://ww)\.", "$1w."); // fixes http://ww.
// complete square brackets around external links in ref tags
// First: "[http://..." with no closing ']' before </ref> gets one appended; a '}' directly before </ref>
// that follows a word character is dropped while the word character is kept.
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(<ref.*?>[^\[\]<>]*?\[\s*http://[^\[\]<>]*?)(?:(\w)})?(</ref>)", "$1$2]$3");
// Second: "http://...]" with no opening '[' directly after the <ref> tag gets one prepended.
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(<ref.*?>)(\s*http://[^\[\]<>]*?\][^\[\]<>]*?</ref>)", "$1[$2");
// The summary is keyed on the text having changed, not on the counter.
if(!ArticleTextBeforeURL.Equals(ArticleText))
{
SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
Summary += String.Format("fix web link format ({0}), ", SpecificFixes);
}
// A line of the form ''for X, see [[Y]] ("please" optional), ending directly after the link,
// is converted to the {{for|X|Y}} hatnote template.
VisibleFixes += RegexReplace(ref ArticleText, @"(?mi)^\s*''for\s(.+?),\s+(?:please\s)?see\s+\[\[(.+?)\]\]\n", "{{for|$1|$2}}\n");
// Disabled: splitting of fully wikilinked [[Month DD, YYYY]] / [[DD Month YYYY]] dates into separate links.
//VisibleFixes += RegexReplace(ref ArticleText, @"(?!\[\[September 11,? 2001\]\])\[\[\s*" + MonthList + @"\s*0?([1-3]?\d)(,?)\s*(200\d|19[7-9]\d)\s*\]\]", "[[$1 $2]]$3 [[$4]]"); // [[Month DD, YYYY]] fix
//VisibleFixes += RegexReplace(ref ArticleText, @"\[\[\s*0?([1-3]?\d)\s*" + MonthList + @",?\s*(200\d|19[7-9]\d)\s*\]\]", "[[$1 $2]] [[$3]]"); // [[DD Month YYYY]] fix
// Runs only when some citation template has a url value containing ".pdf" (case-insensitive).
// AddPDFFormatField is defined outside this section; presumably it adds a format field - confirm.
if(Regex.IsMatch(ArticleText, @"(?si){{\s*cit[^{}]*\|\s*url\s*=\s*[^\|}{]+?\.PDF"))
VisibleFixes += AddPDFFormatField(ref ArticleText);
// Redundant-field cleanup inside citation templates. The snapshot below drives the summary at the end.
ArticleTextBeforeDuplicateCiteField = ArticleText;
// remove year and month entries if they match a date entry
// "date" with a four-digit year followed later in the same template by "year=" with the same value: the year field is dropped.
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"date\s*=\s*(?:\[*" + MonthList2 + @"\s+[0-3]?\d?|[0-3]?\d?\s*" + MonthList2 + @")\]*,?\s*(\d{4})\b(?:\s*\|?[^{}]*?))\|\s*\byear\s*=\s*\2\s*(\||\}\})", "$1$3", false); // date then year (Int/Am date)
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"date\s*=\s*(\d{4})-[01]\d-[0-3]\d(?:\s*\|?[^{}]*?))\|\s*\byear\s*=\s*\2\s*(\||\}\})", "$1$3", false); // date then year (ISO date)
// Same redundancy with the year field appearing before the date field.
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*)\|\s*year\s*=\s*(\d{4})\s*((?:\|[^{}]*?)?\|?\s*\bdate\s*=\s*(?:\[*" + MonthList2 + @"\s+[0-3]?\d?|[0-3]?\d?\s*" + MonthList2 + @")\]*,?\s*\2\s*(\||\}\}))", "$1$3", false); // year then date (Int/Am date)
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*)\|\s*year\s*=\s*(\d{4})\s*((?:\|[^{}]*?)?\|?\s*\bdate\s*=\s*\2-[01]\d-[0-3]\d\s*(\||\}\}))", "$1$3", false); // year then date (ISO date)
// A "month" field repeating the month already named in the date field is dropped (either order).
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"date\s*=\s*[^{}\|=]*?\b" + MonthList + @"\b(?:\s*\|?[^{}]*?))\|\s*\bmonth\s*=\s*\2\s*(\||\}\})", "$1$3", false); // date then month
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*)\|\s*month\s*=\s*" + MonthList + @"\s*((?:\|[^{}]*?)?\|?\s*\bdate\s*=\s*[^{}\|=]*?\b\2\s*([^{}\|=]*?(\||\}\})))", "$1$3", false); // month then date
// Same field given twice with the same value: the first occurrence is removed and the later one kept.
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)({{\s*[Cc]it[^{}]*\|\s*)(\w+)\s*=\s*([^\|}{]+?)\s*\|((?:[^{}]*?\|)?\s*)\2(\s*=\s*)\3(\s*(\||\}\}))", "$1$4$2$5$3$6", false); // duplicate field remover for cite templates
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)(\{\{\s*cit[^{}]*\|\s*)(\w+)(\s*=\s*[^\|}{\s][^\|}{]+?\s*\|?(?:[^{}]*?)?)\|\s*\2\s*=\s*(\||\}\})", "$1$2$3$4", false); // 'field=populated | field=null' drop field=null
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)(\{\{\s*cit[^{}]*\|\s*)(\w+)\s*=\s*\|\s*((?:[^{}]+?\|)?\s*\2\s*=\s*[^\|}{\s])", "$1$3", false); // 'field=null | field=populated' drop field=null
if(!ArticleTextBeforeDuplicateCiteField.Equals(ArticleText))
{
// NOTE(review): VisibleFixesCheckpoint is not reset in this section, and the SpecificFixes value
// computed here is not used by the summary text below - confirm whether a count was meant to be reported.
SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
Summary += "rm dupe cite field(s), ";
}
// DEFAULTSORT fixes, may be caught by AWB gen fixes before reaching here
// The snapshot below drives the summary at the end of the section.
ArticleTextBeforeDEFAULTSORT = ArticleText;
// A second DEFAULTSORT with the same value as an earlier one is removed.
VisibleFixes += RegexReplace(ref ArticleText, @"(?s)({{DEFAULTSORT:(.*?)}})\s+(.*?){{DEFAULTSORT:\2}}\s+", "$1\n$3", false); // double defaultsort remover
// Exact repeat of a category line: the later copy is removed.
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)(\n\[\[Category:)([^\]]+\]\])(.*?)\1\2", "$1$2$3"); // duplicate category remover
// Category with a sort key first, same category without one later: the later plain copy is removed.
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)(\n\[\[Category:)([^\]\|]+)(\|.*?\]\])(.*?)\1\2\]\]", "$1$2$3$4"); // duplicate category remover 2
// Plain category first, same category with a sort key later: the sort key moves to the first copy and the later copy is removed.
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s)(\n\[\[Category:)([^\]\|]+)\]\](.*?)\1\2(\|.*?\]\])", "$1$2$4$3"); // duplicate category remover 3
// A category sort key identical to the DEFAULTSORT value is removed from the category link.
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?s){{DEFAULTSORT\:([^{}]+)}}(.*?\[\[Category\:[^{}]+)\|\1\]\]", "{{DEFAULTSORT:$1}}$2]]", false); // defautsort with category cleaner
// No (?s) here, so '.' does not cross newlines: the text between the DEFAULTSORT line and "[[Category:" must be on one line.
VisibleFixes += RegexReplace(ref ArticleText, @"({{DEFAULTSORT.*?}}\n)(.*?)(\[\[Category:)", "$2$1$3", false); // moves defaultsort to be directly above category
// Disabled experiments: birth/death category cleanup, {{Lifetime}} conversion and full wikilinking of partly linked dates.
//VisibleFixes += RegexReplace(ref ArticleText, @"(?si){{DEFAULTSORT\:([^{}]+)}}(.*?\[\[Category\:(?:\d{4} (?:births|deaths)|Living people))\|\1\]\]", "{{DEFAULTSORT:$1}}$2]]"); // births deaths category cleaner
//VisibleFixes += RegexReplace(ref ArticleText, @"(?si){{DEFAULTSORT\:([^{}]+)}}(.*?)\[\[Category:Living people\]\]\n(.*?)\[\[Category:Year of birth missing (living people)\]\](?:\n)?", "{{Lifetime|||$1}}$2$3"); // living birth missing lifetime
//VisibleFixes += RegexReplace(ref ArticleText, @"(\[\[(?:" + MonthList2 + @"\s*[1-3]?\d|[1-3]?\d\s+" + MonthList2 + @")\]\],?)\s+(\d{4})\b", "$1 [[$2]]"); // fully wikilink part wikilinked dates
//VisibleFixes += RegexReplace(ref ArticleText, @"\b((?:" + MonthList2 + @"\s*[1-3]?\d|[1-3]?\d\s+" + MonthList2 + @"))(,?)\s+(\[\[\d{4}\]\])", "[[$1]]$2 $3"); // fully wikilink part wikilinked dates
if(!ArticleTextBeforeDEFAULTSORT.Equals(ArticleText))
Summary += "tidy up DEFAULTSORT/categories, ";
// langfixes
// A colon typed inside the link label is moved to just after the closing brackets.
// The link must follow '(' , ';' or ',' and its label must repeat the language name.
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?i)((\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\3)\s*:+(\]\])", "$1$4:"); // [[blah language|blah:]] --> [[blah language|blah]]:
// "for" mode: "[[X language|X]]: term" is rewritten as "[[X language|X]] for term".
if(LANGTAG_FOR_MODE)
{
int NonCountedFixes = 0; // language tag for fix may involve multiple steps that should count as just one
VisibleFixes += MottoFix(ref ArticleText);
// Checkpoint is taken after MottoFix, so its fixes are excluded from the count reported below.
VisibleFixesCheckpoint = VisibleFixes;
// Preparatory step, tallied in NonCountedFixes rather than VisibleFixes.
NonCountedFixes += LoopedRegexReplace(ref ArticleText, @"(\(|;\s*)\[\[" + LangList + @"\]\]\s*:+(\s*(?<ap>''|"")?(''')?([^'{}\)\]\[;/""]+?)\k<ap>?(\2)?\s*(\)|;|\s=|,\s*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|\(?\d{3,4}))(\2)?(?:\k<ap>([^']))?)", "$1[[$2 language|$2]]:$3"); // converts wikilinked language name to [[blah language|blah]] to be templated by later fixes
// The colon(s) and any whitespace after the language link are replaced by " for ".
VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?i)((\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\3\s*\]\]\s*):+\s*", "$1 for ");
SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
if(SpecificFixes > 0)
Summary += String.Format("clarify translation by using 'for' ({0}), ", SpecificFixes);
}
if(!REPORT_MODE && !LANGTAG_FOR_MODE)
{
int NonCountedFixes = 0; // language tag fixes involve multiple steps that should count as just one
ArticleTextBeforeLanguageTemplate = ArticleText;
VisibleFixes += MottoFix(ref ArticleText);
NonCountedFixes += LoopedRegexReplace(ref ArticleText, @"(\(|;\s*)\[\[" + LangList + @"\]\]\s*:+(\s*(?<ap>''|"")?(''')?([^'{}\)\]\[;/""]+?)\k<ap>?(\2)?\s*(\)|;|\s=|,\s*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|\(?\d{3,4}))(\2)?(?:\k<ap>([^']))?)", "$1[[$2 language|$2]]:$3"); // converts wikilinked language name to [[blah language|blah]] to be templated by later fixes
NonCountedFixes += LoopedRegexReplace(ref ArticleText, @"(?i)(\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\2\s*\]\]\s*(?:translation)?:+\s*((?:''')?)((?:''|"")?)([^{}\)\(\]\[;/""<>']*?(?:[^{}\)\(\]\[;/""<>']'[^{}\)\(\]\[;/""<>']*?)*(?:\([^{}\)\(\]\[;/""<>']+\)[^{}\)\(\]\[;/""<>']*)*)(?:(\3)\4\s*((?:(?<=[^']'''?(?:'')?)\s*)|\)|;|\s=|(?:,|(?<=[^']'''?(?:'')?))\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))|\s*(\)|;|\s=|,\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|January|February|March|April|May|June|July|August|September|October|November|December|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))(\3)\4)", "$1{{lang-$2@@|$3$5$6$9}}$7$8"); //Language tag fix general, but not items with '''bold word''' in middle
NonCountedFixes += LoopedRegexReplace(ref ArticleText, @"(?i)(\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\2\s*\]\]\s*(?:translation)?:+\s*((?:''')?)((?:''|"")?)([^{}\)\(\]\[;/""<>']*?(?:[^{}\)\(\]\[;/""<>']'[^{}\)\(\]\[;/""<>']*?)*(?:'''[^{}\)\(\]\[;/""<>']+'''[^{}\)\(\]\[;/""<>']*)+)(?:(\3)\4\s*((?:(?<=[^']'''?(?:'')?)\s*)|\)|;|\s=|(?:,|(?<=[^']'''?(?:'')?))\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))|\s*(\)|;|\s=|,\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|January|February|March|April|May|June|July|August|September|October|November|December|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))(\3)\4)", "$1{{lang-$2@@|$3$5$6$9}}$7$8"); //Language tag fix for items with '''bold word''' in middle
// before running this find & replaces, check that one or more of the above three fixes changed something
if (!ArticleTextBeforeLanguageTemplate.Equals(ArticleText))
{
VisibleFixesCheckpoint = VisibleFixes;
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Abkhazian@@\|", "$1ab|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Afar@@\|", "$1aa|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Afrikaans@@\|", "$1af|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Akan@@\|", "$1ak|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Albanian@@\|", "$1sq|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Amharic@@\|", "$1am|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Arabic@@\|", "$1ar|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Aragonese@@\|", "$1an|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Armenian@@\|", "$1hy|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Assamese@@\|", "$1as|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Avaric@@\|", "$1av|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Avestan@@\|", "$1ae|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Aymara@@\|", "$1ay|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Azerbaijani@@\|", "$1az|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Bambara@@\|", "$1bm|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Bashkir@@\|", "$1ba|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Basque@@\|", "$1eu|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Belarusian@@\|", "$1be|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Bengali@@\|", "$1bn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Bihari@@\|", "$1bh|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Bislama@@\|", "$1bi|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Bosnian@@\|", "$1bs|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Breton@@\|", "$1br|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Bulgarian@@\|", "$1bg|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Burmese@@\|", "$1my|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Catalan@@\|", "$1ca|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Chamorro@@\|", "$1ch|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Chechen@@\|", "$1ce|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Chichewa@@\|", "$1ny|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Chinese@@\|", "$1zh|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Church Slavic@@\|", "$1cu|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Chuvash@@\|", "$1cv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Cornish@@\|", "$1kw|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Corsican@@\|", "$1co|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Cree@@\|", "$1cr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Croatian@@\|", "$1hr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Czech@@\|", "$1cs|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Danish@@\|", "$1da|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Divehi@@\|", "$1dv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Dutch@@\|", "$1nl|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Dzongkha@@\|", "$1dz|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)English@@\|", "$1en|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Esperanto@@\|", "$1eo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Estonian@@\|", "$1et|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Ewe@@\|", "$1ee|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Faroese@@\|", "$1fo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Fijian@@\|", "$1fj|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Finnish@@\|", "$1fi|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)French@@\|", "$1fr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Fulah@@\|", "$1ff|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Galician@@\|", "$1gl|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Ganda@@\|", "$1lg|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Georgian@@\|", "$1ka|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)German@@\|", "$1de|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Greek@@\|", "$1el|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Guaran@@\|", "$1gn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Gujarati@@\|", "$1gu|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Haitian@@\|", "$1ht|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Hausa@@\|", "$1ha|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Hebrew@@\|", "$1he|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Herero@@\|", "$1hz|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Hindi@@\|", "$1hi|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Hiri Motu@@\|", "$1ho|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Hungarian@@\|", "$1hu|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Icelandic@@\|", "$1is|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Ido@@\|", "$1io|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Igbo@@\|", "$1ig|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Indonesian@@\|", "$1id|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Interlingue@@\|", "$1ie|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Inuktitut@@\|", "$1iu|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Inupiaq@@\|", "$1ik|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Irish@@\|", "$1ga|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Italian@@\|", "$1it|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Japanese@@\|", "$1ja|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Javanese@@\|", "$1jv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kalaallisut@@\|", "$1kl|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kannada@@\|", "$1kn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kanuri@@\|", "$1kr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kashmiri@@\|", "$1ks|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kazakh@@\|", "$1kk|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Khmer@@\|", "$1km|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kikuyu@@\|", "$1ki|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kinyarwanda@@\|", "$1rw|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kirghiz@@\|", "$1ky|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kirundi@@\|", "$1rn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Komi@@\|", "$1kv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kongo@@\|", "$1kg|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Korean@@\|", "$1ko|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kurdish@@\|", "$1ku|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Kwanyama@@\|", "$1kj|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Lao@@\|", "$1lo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Latin@@\|", "$1la|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Latvian@@\|", "$1lv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Limburgish@@\|", "$1li|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Lingala@@\|", "$1ln|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Lithuanian@@\|", "$1lt|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Luxembourgish@@\|", "$1lb|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Macedonian@@\|", "$1mk|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Malagasy@@\|", "$1mg|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Malay@@\|", "$1ms|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Malayalam@@\|", "$1ml|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Maltese@@\|", "$1mt|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Manx@@\|", "$1gv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Marathi@@\|", "$1mr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Marshallese@@\|", "$1mh|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Moldavian@@\|", "$1mo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Mongolian@@\|", "$1mn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Nauru@@\|", "$1na|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Navajo@@\|", "$1nv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Ndonga@@\|", "$1ng|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Nepali@@\|", "$1ne|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)North Ndebele@@\|", "$1nd|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Northern Sami@@\|", "$1se|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Norwegian@@\|", "$1no|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Norwegian Bokml@@\|", "$1nb|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Norwegian Nynorsk@@\|", "$1nn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Occitan@@\|", "$1oc|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Ojibwa@@\|", "$1oj|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Oriya@@\|", "$1or|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Oromo@@\|", "$1om|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Ossetian@@\|", "$1os|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)P[au]njabi@@\|", "$1pa|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Pashto@@\|", "$1ps|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Persian@@\|", "$1fa|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Polish@@\|", "$1pl|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Portuguese@@\|", "$1pt|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Quechua@@\|", "$1qu|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Raeto-Romance@@\|", "$1rm|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Romanian@@\|", "$1ro|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Russian@@\|", "$1ru|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Samoan@@\|", "$1sm|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Sango@@\|", "$1sg|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Sanskrit@@\|", "$1sa|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Sardinian@@\|", "$1sc|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Scottish Gaelic@@\|", "$1gd|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Serbian@@\|", "$1sr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Serbo-Croatian@@\|", "$1sh|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Shona@@\|", "$1sn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Sichuan Yi@@\|", "$1ii|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Sindhi@@\|", "$1sd|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Sinhala@@\|", "$1si|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Slovak@@\|", "$1sk|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Slovenian@@\|", "$1sl|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Somali@@\|", "$1so|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)South Ndebele@@\|", "$1nr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Southern Sotho@@\|", "$1st|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Spanish@@\|", "$1es|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Sundanese@@\|", "$1su|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Swahili@@\|", "$1sw|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Swati@@\|", "$1ss|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Swedish@@\|", "$1sv|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tagalog@@\|", "$1tl|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tahitian@@\|", "$1ty|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tajik@@\|", "$1tg|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tamil@@\|", "$1ta|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tatar@@\|", "$1tt|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Telugu@@\|", "$1te|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Thai@@\|", "$1th|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tibetan@@\|", "$1bo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tigrinya@@\|", "$1ti|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tonga@@\|", "$1to|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tsonga@@\|", "$1ts|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Tswana@@\|", "$1tn|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Turkish@@\|", "$1tr|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Turkmen@@\|", "$1tk|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Twi@@\|", "$1tw|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Uighur@@\|", "$1ug|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Ukrainian@@\|", "$1uk|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Urdu@@\|", "$1ur|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Uzbek@@\|", "$1uz|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Venda@@\|", "$1ve|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Vietnamese@@\|", "$1vi|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Volapk@@\|", "$1vo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Walloon@@\|", "$1wa|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Welsh@@\|", "$1cy|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Western Frisian@@\|", "$1fy|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Wolof@@\|", "$1wo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Xhosa@@\|", "$1xh|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Yiddish@@\|", "$1yi|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Yoruba@@\|", "$1yo|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Zhuang@@\|", "$1za|");
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)({{lang-)Zulu@@\|", "$1zu|");
VisibleFixes += RegexReplace(ref ArticleText, @"([^'])('')((?:\(|;\s*){{lang-\w\w\|.*?}}\))\2([^'])", "$1$3$4"); //removes outside italics
VisibleFixes += RegexReplace(ref ArticleText, @"([^'])(''')((?:\(|;\s*){{lang-\w\w\|)([^']+?)(}}\))\2([^'])", "$1$3$2$4$2$5$6"); //moves bold inside
NonCountedFixes += RegexReplace(ref ArticleText, @" }};", " }}", false); // Fix where control character matches on ; within Anbsp; HTML tag
NonCountedFixes += RegexReplace(ref ArticleText, @"([\(;,]\s*\{\{lang-([a-z][a-z])\|[^{}\|<>""]+}},?\s+(?:or|also|also\s+called|singular:)\s+)(''+[^{}\|<>'""]+''+)\s*\)", "$1{{lang|$2|$3}})"); // ({{lang-aa|word}} or ''word'') --> ({{lang-aa|word}} or {{lang|ar|''word''}})
SpecificFixes = VisibleFixes - VisibleFixesCheckpoint;
Summary += String.Format("apply [[Template:Lang]] ({0}), ", SpecificFixes);
}
}
else if(REPORT_MODE) // just extract text in ([[Foreign language|Foreign]]: ''text'') to edit summary to review whether it's English or not
{
string LangTagReport = "";
LangTagReport = LanguageTagReport(ArticleText);
if(!LangTagReport.Equals(""))
{
LogToFile("@@@LANGTAG@@@" + LangTagReport);
ArticleLogged = true;
}
}
// lang template whitespace fix
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(\{\{)\s*(lang-\w\w\|)\s+([^{}<>]+?)\s*(\}\})", "$1$2$3$4", false);
VisibleFixes += RegexReplace(ref ArticleText, @"(?i)(\{\{)\s*(lang-\w\w\|)\s*([^{}<>]+?)\s+(\}\})", "$1$2$3$4", false);
// do some stuff in report mode only, TODO under construction
if(REPORT_MODE || APPLY_AMBIGUOUS_FIXES)
{
ArticleTextBeforeReportMode = ArticleText;
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*\w+\s\w+\s\w+\s\w+\s*)=\s*\|", "$1|"); // remove = from end of field
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(\{\{\s*cit[^{}]*\|)(^[={}]*?\|)", "$1unused_text=$2"); // cite fields without a field= entry between two bars
VisibleFixes += RegexReplace(ref ArticleText, @"(?si)(<ref)>(\s+name\s*=\s*[^<>\/]+?/?>)", "$1$2"); // <ref> name = "Fred">, TODO scan & test
if(!Regex.IsMatch(ArticleText, @"(?si)\{\{\s*cite\s+(map|sm|manual)\b\s*\|"))
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"\w{2,}\s*=\s*[^{}\|<>]+?)[,\\/\+]?(\s(?-i)[a-z\d]{2,}\s*=)", "$1|$2"); // cite fields with no | between fields, |, matches lowercase second field name only, TODO scan & test,
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"url\s*)\|(\s*http://)", "$1=$2"); // {{cite web url|http://, TODO scan & test
// trial fixes section
// cite...title=[http://url description] --> title=description | url=http://url, TODO scan & test
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"title\s*=\s*)\[+\s*(http://[^\[\]<>""\s]+)\s+([^{}\|<>]+?)\]+(\s*(?:\||\}\}))" , "$1$3|url=$2$4");
// cite title=http://url --> title=null | url=http://url
VisibleFixes += RegexReplace(ref ArticleText, SICitStart + @"title\s*=\s*)\[*\s*(http://[^\[\]<>""\s]+)\s*\]*" + TemEnd, "$1|url=$2$3");
//VisibleFixes += LoopedRegexReplace(ref ArticleText, @"(?i)((\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\3)\s*(\]\])\s*,", "$1$4:"); // [[blah language|blah]], --> [[blah language|blah]]:, many false positives, TODO probably remove this
// end trial fixes
if (!ArticleTextBeforeReportMode.Equals(ArticleText))
{
LogToFile("@@@AMBIG@@@" + "Ambiguous fixes applied");
ArticleLogged = true;
}
}
// if in report mode log articles with a visible change, except those already logged by langtags report mode
if(REPORT_MODE && VisibleFixes > 0 && !ArticleLogged)
{
if(Summary.Equals(""))
Summary = "no specific summary";
LogToFile("@@@SUMMARY@@@" + Summary);
}
if ((VisibleFixes == 0 && !FIXING_TYPOS) || REPORT_MODE)
{
Skip = true;
Summary = "";
return(OriginalArticleText);
}
return(ArticleText);
}
// Performs a regex find & replace on the given text repeatedly, so that matches which only
// become possible after an earlier replacement are handled too.
//   ArticleTextLocal - text to process; updated in place with the result
//   Find             - regular expression to search for
//   Replace          - replacement pattern ($1, $2 ... refer to groups in Find)
//   VisibleFix       - true if the replacements count as visible changes
// Looping stops as soon as a pass finds no match or leaves the text unchanged, or after MaxPasses passes.
// Returns the number of matches found on passes that actually changed the text when VisibleFix
// is true, otherwise 0. Matches whose replacement is a no-op are not counted, in line with RegexReplace.
public int LoopedRegexReplace(ref string ArticleTextLocal, string Find, string Replace, bool VisibleFix)
{
    const int MaxPasses = 20; // upper bound so a replacement that keeps re-creating its own match cannot loop forever
    int MatchesSeen = 0;      // every match found, used for the debug log only
    int MatchesApplied = 0;   // matches found on passes that changed the text
    for(int Pass = 0; Pass < MaxPasses; Pass++)
    {
        int PassMatches = Regex.Matches(ArticleTextLocal, Find).Count;
        if(PassMatches == 0)
            break; // nothing (left) to replace
        MatchesSeen += PassMatches;
        string Replaced = Regex.Replace(ArticleTextLocal, Find, Replace);
        if(Replaced.Equals(ArticleTextLocal))
            break; // matched, but the replacement changed nothing: do not report it as a fix
        ArticleTextLocal = Replaced;
        MatchesApplied += PassMatches;
    }
    if(MatchesSeen > 0 && DEBUG_MODE)
        LogToFile(String.Format(@"@@@DEBUG@@@Matched {0} times on find=", MatchesSeen) + Find + " replace=" + Replace);
    if(VisibleFix)
        return(MatchesApplied);
    return(0);
}
// Convenience overload of the looped find & replace: every replacement is treated as a visible fix.
// Returns whatever the four-argument overload returns with VisibleFix set to true.
public int LoopedRegexReplace(ref string ArticleTextLocal, string Find, string Replace)
{
    const bool CountAsVisibleFix = true;
    int Fixes = LoopedRegexReplace(ref ArticleTextLocal, Find, Replace, CountAsVisibleFix);
    return Fixes;
}
// Works out the dominant date format of an article from its text.
// Counts "month then day-number" occurrences against "day-number then month" occurrences
// (month names come from MonthList) and returns the locale label, which is also used in the edit summary:
//   "Intl." when day-first dates are strictly more frequent, otherwise "US" (so a tie gives "US").
public string DetermineArticleDateLocale(string ArticleTextLocal)
{
    string MonthThenDay = MonthList + @"\s*[0-3]?\d";
    string DayThenMonth = @"[0-3]?\d\s*" + MonthList;
    int UsStyleDates = Regex.Matches(ArticleTextLocal, MonthThenDay).Count;
    int IntlStyleDates = Regex.Matches(ArticleTextLocal, DayThenMonth).Count;
    // international format only wins with a strict majority
    return (IntlStyleDates > UsStyleDates) ? "Intl." : "US";
}
// Adds a "format=PDF" field to citations whose url field value ends in .PDF.
// Citations that already have a format field mentioning PDF are masked first (their opening {{ becomes #{{#)
// so the url pattern cannot match them, and are unmasked again afterwards.
//   ArticleTextLocal - text to process; updated in place
// Returns the number of url matches found, or 0 if the text ended up unchanged.
// NOTE(review): the unmask step rewrites every literal "#{{#" in the text, so text that already
// contained that sequence before masking would be altered too - confirm this cannot occur in practice.
public int AddPDFFormatField(ref string ArticleTextLocal)
{
    string Original = ArticleTextLocal;
    // SICitStart is defined elsewhere in the file; it supplies the start of the citation pattern and opens group 1
    string PdfUrlCitation = SICitStart + @"url\s*=\s*[^\|}{]+?\.PDF\s*)(\||\}\})";

    // step 1: hide citations that already give a PDF format
    ArticleTextLocal = Regex.Replace(ArticleTextLocal, @"(?si)\{\{(\s*cit[^{}]*format\s*=[^}{]*PDF)", "#{{#$1");

    // step 2: count the remaining PDF urls, then append the format field to each
    int Added = Regex.Matches(ArticleTextLocal, PdfUrlCitation).Count;
    ArticleTextLocal = Regex.Replace(ArticleTextLocal, PdfUrlCitation, "$1|format=PDF$2");

    // step 3: restore the hidden citations
    ArticleTextLocal = Regex.Replace(ArticleTextLocal, @"#\{\{#", "{{");

    if(Original.Equals(ArticleTextLocal))
        return(0);
    return(Added);
}
// Extracts the text found in constructs such as ([[Foreign language|Foreign]]: ''text'') so it can be logged
// and reviewed (e.g. to check whether the text is actually English).
// Works on a local copy of the article text only; the caller's text is not modified.
// Returns "" when nothing was found, otherwise "@@@langtag@@@" followed by one
// "<language>@@@<text>@@@" entry per match (group 2 = language name, group 5 = tagged text).
public string LanguageTagReport(string ArticleTextLocal)
{
    // general language tag pattern
    Regex GeneralTag = new Regex(@"(?i)(\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\2\s*\]\]\s*(?:translation)?:+\s*((?:''')?)((?:''|"")?)([^{}\)\(\]\[;/""<>']*?(?:[^{}\)\(\]\[;/""<>']'[^{}\)\(\]\[;/""<>']*?)*(?:\([^{}\)\(\]\[;/""<>']+\)[^{}\)\(\]\[;/""<>']*)*)(?:(\3)\4\s*((?:(?<=[^']'''?(?:'')?)\s*)|\)|;|\s=|(?:,|(?<=[^']'''?(?:'')?))\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))|\s*(\)|;|\s=|,\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|January|February|March|April|May|June|July|August|September|October|November|December|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))(\3)\4)", RegexOptions.Compiled);
    // language tag pattern for items with a '''bold word''' in the middle
    Regex BoldInMiddleTag = new Regex(@"(?i)(\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\2\s*\]\]\s*(?:translation)?:+\s*((?:''')?)((?:''|"")?)([^{}\)\(\]\[;/""<>']*?(?:[^{}\)\(\]\[;/""<>']'[^{}\)\(\]\[;/""<>']*?)*(?:'''[^{}\)\(\]\[;/""<>']+'''[^{}\)\(\]\[;/""<>']*)+)(?:(\3)\4\s*((?:(?<=[^']'''?(?:'')?)\s*)|\)|;|\s=|(?:,|(?<=[^']'''?(?:'')?))\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))|\s*(\)|;|\s=|,\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|January|February|March|April|May|June|July|August|September|October|November|December|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))(\3)\4)", RegexOptions.Compiled);

    // convert a plain wikilinked language name to [[blah language|blah]] so the patterns above can see it
    ArticleTextLocal = Regex.Replace(ArticleTextLocal, @"(\(|;\s*)\[\[" + LangList + @"\]\]\s*:+(\s*(?<ap>''|"")?(''')?([^'{}\)\]\[;/""]+?)\k<ap>?(\2)?\s*(\)|;|\s=|,\s*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|\(?\d{3,4}))(\2)?(?:\k<ap>([^']))?)", "$1[[$2 language|$2]]:$3");

    string Report = "@@@langtag@@@";
    MatchCollection GeneralMatches = GeneralTag.Matches(ArticleTextLocal);
    foreach(Match Found in GeneralMatches)
        Report += Found.Groups[2].Value + "@@@" + Found.Groups[5].Value + "@@@";

    // the bold-in-middle pattern requires the general language tag fix to have been applied first;
    // the fix count returned here is irrelevant for the report and is discarded
    LoopedRegexReplace(ref ArticleTextLocal, @"(?i)(\(|[;,]\s*)\s*\[\[\s*" + LangList + @" language\s*\|\s*\2\s*\]\]\s*(?:translation)?:+\s*((?:''')?)((?:''|"")?)([^{}\)\(\]\[;/""<>']*?(?:[^{}\)\(\]\[;/""<>']'[^{}\)\(\]\[;/""<>']*?)*(?:\([^{}\)\(\]\[;/""<>']+\)[^{}\)\(\]\[;/""<>']*)*)(?:(\3)\4\s*((?:(?<=[^']'''?(?:'')?)\s*)|\)|;|\s=|(?:,|(?<=[^']'''?(?:'')?))\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))|\s*(\)|;|\s=|,\s*\[*(?:(?:commonly\s+)?abbreviated|singular|plural|from|transl|alternative|\bor\b|(?:sometimes\s+)?also|meaning|literally|died|born|January|February|March|April|May|June|July|August|September|October|November|December|(?-i)[A-Z]+\b(?i)|\(?\d{3,4}))(\3)\4)", "$1{{lang-$2@@|$3$5$6$9}}$7$8");

    MatchCollection BoldMatches = BoldInMiddleTag.Matches(ArticleTextLocal);
    if(GeneralMatches.Count == 0 && BoldMatches.Count == 0)
        return(""); // nothing to report
    foreach(Match Found in BoldMatches)
        Report += Found.Groups[2].Value + "@@@" + Found.Groups[5].Value + "@@@";
    return(Report);
}
// Applies a single (non-looped) regex find & replace to the article text.
//   ArticleText - text to process; updated in place with the result
//   Find        - regular expression to search for
//   Replace     - replacement pattern
//   VisibleFix  - true if the replacements count as visible changes
// In debug mode every pattern that matched is written to the log, whether or not the text changed.
// Returns the number of matches when the text was changed and VisibleFix is true, otherwise 0.
public int RegexReplace(ref string ArticleText, string Find, string Replace, bool VisibleFix)
{
    string Original = ArticleText;
    int Hits = Regex.Matches(Original, Find).Count;
    ArticleText = Regex.Replace(Original, Find, Replace);
    if(DEBUG_MODE && Hits > 0)
        LogToFile(String.Format(@"@@@DEBUG@@@Matched {0} times on ", Hits) + Find);
    bool TextChanged = !Original.Equals(ArticleText);
    if(VisibleFix && TextChanged)
        return(Hits);
    return(0);
}
// Convenience overload of the single find & replace: the fix is assumed to be visible.
// Returns whatever the four-argument overload returns with VisibleFix set to true.
public int RegexReplace(ref string ArticleText, string Find, string Replace)
{
    const bool CountAsVisibleFix = true;
    int Fixes = RegexReplace(ref ArticleText, Find, Replace, CountAsVisibleFix);
    return Fixes;
}
// Appends one line to the log file "Module.log" (relative path).
// The line is the current article title (ArticleTitleG) wrapped in [[ ]] followed by the given text,
// with any line breaks in the text flattened to spaces so each log entry stays on a single line.
public void LogToFile(string text)
{
    // 'using' guarantees the writer (and its file handle) is released even if WriteLine throws
    using(System.IO.StreamWriter writer = new System.IO.StreamWriter("Module.log", true)) // true = append mode
    {
        writer.WriteLine("[[" + ArticleTitleG + "]]" + Regex.Replace(text, "\r?\n", " "));
    }
}
// Templates the motto in "motto = ..." fields where the motto is followed by a <br> (optionally
// <small> and an opening bracket) and then [[Latin]]: or [[Greek]]: with further text.
// The motto is wrapped in {{lang-la|...}} / {{lang-el|...}} and the wikilinked language name is removed.
//   ArticleText - text to process; updated in place
// Returns the number of visible fixes made, as reported by RegexReplace.
//
// Capture groups of the assembled pattern:
//   1 = "motto =", 2 = opening ''' (optional), 3 = opening '' (optional), 4 = motto text,
//   5 = separator (<br>, <small>, bracket), 6 = text after the language link, 7 = trailing "|"
public int MottoFix(ref string ArticleText)
{
    int VisibleFixes = 0;
    string MottoBeg = @"(?i)(motto\s*=\s*)(''')?('')?(?:{{lang\|";
    // The closing markup is matched with \3?\2? (closing '' then closing '''). This previously read \3?\4?,
    // where group 4 is the motto text itself, so a closing ''' was swallowed into group 4 and then
    // emitted twice by the "$2$4$2" replacement.
    string MottoMid = @"\|)?(.*?)(?:}})?\3?\2?(\s*\<\/?br\s*\/?\>\s*(?:\<small\>)?\s*\(?\s*)\[\[\s*";
    string MottoEnd = @"\s*\]\]\:\s*(.*?)(\s*\|)";
    VisibleFixes += RegexReplace(ref ArticleText, MottoBeg + @"la" + MottoMid + @"Latin" + MottoEnd, "$1{{lang-la|$2$4$2}}$5$6$7");
    VisibleFixes += RegexReplace(ref ArticleText, MottoBeg + @"el" + MottoMid + @"Greek" + MottoEnd, "$1{{lang-el|$2$4$2}}$5$6$7");
    return(VisibleFixes);
}
// Tidies a candidate name (used for reference names) so it is safe and readable.
// Steps, in order: strip HTML comments (which may be present in masked form), trim punctuation and
// spaces from both ends, remove quote/bracket characters and wiki bold/italic markup, normalise spacing.
// Returns "" when the result mentions 'retrieved'/'accessed' or contains a line that is just a number,
// otherwise the cleaned name.
public string CleanFriendlyName(string FriendlyName)
{
    char[] EdgeChars = @".;: {}[]|`?\/$’‘-_–=+,".ToCharArray();

    string Cleaned = Regex.Replace(FriendlyName, @"(\<\!--.*?--\>|⌊{3,}\d+⌋{3,})" , ""); // rm comments, might be masked
    Cleaned = Cleaned.Trim(EdgeChars);
    Cleaned = Regex.Replace(Cleaned, @"(''+|[“‘”""\[\]\(\)\<\>⌋⌊])" , ""); // remove chars
    Cleaned = Regex.Replace(Cleaned, @"(\s{2,}| |\t|\n)" , " "); // spacing fixes

    // a name such as 'retrieved on...' or a bare number is not a usable name
    bool Unusable = Regex.IsMatch(Cleaned, @"(?im)(\s*(date\s+)?(retrieved|accessed)\b|^\d+$)");
    return Unusable ? "" : Cleaned;
}
// Derives a friendly name from the first match of Mask in the given text.
//   ArticleText - text to search
//   Mask        - regular expression whose capture groups 1..3 hold the name components
//   Components  - how many of those groups to use (values above 3 behave like 3);
//                 group 1 is always used, further groups are appended separated by a space
// Returns "" if Mask does not match, otherwise the combined components passed through CleanFriendlyName.
public string DeriveFriendlyName(string ArticleText, string Mask, int Components)
{
    Match First = new Regex(Mask, RegexOptions.Compiled).Match(ArticleText);
    if(!First.Success)
        return("");

    string Combined = First.Groups[1].Value;
    for(int Group = 2; Group <= 3 && Group <= Components; Group++)
        Combined += " " + First.Groups[Group].Value;

    return(CleanFriendlyName(Combined));
}
// from http://www.codeproject.com/KB/recipes/improvestringsimilarity.aspx
// Computes the Levenshtein edit distance between two strings: the minimum number of single-character
// insertions, deletions and substitutions needed to turn s into t. Comparison is case-sensitive.
// Returns the length of the other string when one of them is empty.
public int ComputeDistance (string s, string t)
{
    int n = s.Length;
    int m = t.Length;
    // trivial cases first, before allocating the matrix
    if(n == 0) return m;
    if(m == 0) return n;

    // distance[i, j] = edit distance between the first i chars of s and the first j chars of t
    int[,] distance = new int[n + 1, m + 1];
    for(int i = 0; i <= n; i++)
        distance[i, 0] = i; // delete all i chars
    for(int j = 0; j <= m; j++)
        distance[0, j] = j; // insert all j chars

    for(int i = 1; i <= n; i++)
    {
        char sChar = s[i - 1];
        for(int j = 1; j <= m; j++)
        {
            // compare chars directly rather than allocating one-character substrings per cell
            int cost = (t[j - 1] == sChar) ? 0 : 1;
            int viaDelete = distance[i - 1, j] + 1;
            int viaInsert = distance[i, j - 1] + 1;
            int viaSubstitute = distance[i - 1, j - 1] + cost;
            distance[i, j] = Math.Min(Math.Min(viaDelete, viaInsert), viaSubstitute);
        }
    }
    return distance[n, m];
}
// Similarity of two strings derived from ComputeDistance:
// 1.0 minus the distance divided by the length of the longer string.
// Two empty strings give 1.0.
public float GetSimilarity(string string1, string string2)
{
    float editDistance = ComputeDistance(string1, string2);
    float longest = Math.Max(string1.Length, string2.Length);

    // both strings empty: avoid dividing by zero
    return (longest == 0.0F) ? 1.0F : 1.0F - editDistance/longest;
}
// Compares the value of one template field in two reference texts.
// RefValue1, RefValue2: the reference texts to inspect.
// FieldName: field to look for; it is inserted into the pattern unescaped, so it is
//            interpreted as regex syntax.
// returns 0 if fields present and the same, -1 if fields not present and +1 if fields present and different
public int FieldsTheSame(string RefValue1, string RefValue2, string FieldName)
{
    // TemEnd is defined elsewhere in this file; presumably it matches whatever ends a field value (TODO confirm)
    string FieldFormat = @"(?si).*?\|\s*" + FieldName + @"\s*=\s*([^{}\|]+?)" + TemEnd + @".*";

    // Extract with Match rather than Replace: Regex.Replace returns its input unchanged
    // when the pattern does not match, which made a missing field look like a field whose
    // value was the whole reference text, so -1 was never returned for absent fields.
    Match FieldMatch1 = Regex.Match(RefValue1, FieldFormat);
    Match FieldMatch2 = Regex.Match(RefValue2, FieldFormat);
    string Field1 = FieldMatch1.Success ? FieldMatch1.Groups[1].Value.Trim() : "";
    string Field2 = FieldMatch2.Success ? FieldMatch2.Groups[1].Value.Trim() : "";

    if(DEBUG_MODE)
        LogToFile(String.Format("@@@DUPREF@@@Asked for '{0}', values@@@{1}@@@{2}", FieldName, Field1, Field2));

    // field missing (or blank) in at least one of the refs
    if(Field1.Equals("") || Field2.Equals(""))
        return(-1);

    if(Field1.Equals(Field2))
        return(0);
    else
        return(1);
}
// Reports whether two references differ only in their page/pages field.
// Returns true when FieldsTheSame gives 1 for "pages" (or, failing that, for "page") and
// the two references have identical word characters once the pages value is blanked.
public bool DifferentByPageOnly(string RefValue1, string RefValue2)
{
    // "page" is only consulted when "pages" did not already report a difference
    bool pageFieldsDiffer = FieldsTheSame(RefValue1, RefValue2, "pages") == 1
        || FieldsTheSame(RefValue1, RefValue2, "page") == 1;
    if(!pageFieldsDiffer)
        return(false);

    // remove value of pages field from refs, then see if they are the same
    string blankPagesValue = @"(?si)(.*?\|\s*pages?\s*=)\s*[^{}\|]+?(\s*(?:\||\}\}).*)";
    string pageless1 = Regex.Replace(RefValue1, blankPagesValue, "$1$2");
    string pageless2 = Regex.Replace(RefValue2, blankPagesValue, "$1$2");

    if(DEBUG_MODE)
        LogToFile(String.Format("@@@DUPREF@@@DifferentByPageOnly removed pages to give values@@@{0}@@@{1}", pageless1, pageless2));

    // refs are different by pages only if word character part of two refs are the same without the pages info in
    string wordsOnly1 = Regex.Replace(pageless1, @"\W", "").Trim();
    string wordsOnly2 = Regex.Replace(pageless2, @"\W", "").Trim();
    return(wordsOnly1.Equals(wordsOnly2));
}
// True only when FieldsTheSame returns 0 for both Field1 and Field2.
// Field2 is not checked if Field1 already fails.
public bool AllFieldsTheSame(string RefValue1, string RefValue2, string Field1, string Field2)
{
    if(FieldsTheSame(RefValue1, RefValue2, Field1) != 0)
        return(false);

    return(FieldsTheSame(RefValue1, RefValue2, Field2) == 0);
}
// Three-field variant: true only when FieldsTheSame returns 0 for Field1, Field2 and Field3.
// Field3 is not checked if the first two already fail.
public bool AllFieldsTheSame(string RefValue1, string RefValue2, string Field1, string Field2, string Field3)
{
    if(!AllFieldsTheSame(RefValue1, RefValue2, Field1, Field2))
        return(false);

    return(FieldsTheSame(RefValue1, RefValue2, Field3) == 0);
}
//