Jump to content

User:Tom.Bot/Task2 code

From Wikipedia, the free encyclopedia

Source

[edit]
/// <summary>
/// Adds a <c>from</c> (or <c>from1</c>) parameter holding the page's Wikidata QID to the single
/// {{Taxonbar}} template on a page, or marks the page to be skipped with an explanatory summary.
/// </summary>
/// <remarks>
/// NOTE(review): the signature and the <c>Tools.*</c> helpers suggest this is an AutoWikiBrowser
/// custom-module entry point; confirm against the host application.
/// The QID is looked up with one HTTP request to the en.wikipedia.org API (prop=pageprops), and
/// only when none of the preliminary checks has already flagged the page to be skipped.
/// </remarks>
/// <param name="ArticleText">Wikitext of the page.</param>
/// <param name="ArticleTitle">Title of the page; percent-encoded and sent as the API <c>titles</c> value.</param>
/// <param name="wikiNamespace">Not used by this method.</param>
/// <param name="Summary">Receives the edit summary on success, or the skip reason(s) when <paramref name="Skip"/> is true.</param>
/// <param name="Skip">Receives true when the page must not be edited.</param>
/// <returns>The (possibly modified) wikitext.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
	// global switches //////////////////////////////////////////////////////////
	
	bool SaveSkipSummaries = false;
	bool SaveSkipSummaries_FromOk = false;
	bool SkipPagesLargerThanLimit = false; // used with int Limit
	bool SaveOnlyPolbotPages = false;
	bool ManuallyCheckPagesWithoutAnInfobox = false; // b/c some BLPs have unnecessary {{Taxonbar}}s
	bool LiveDebug = false;
	bool GenFixes = true; // summary text only
	Skip = false;
	
	
	// global-use vars //////////////////////////////////////////////////////////
	
	int Limit = 2500; // characters (UTF-16 code units) on a page; used with bool SkipPagesLargerThanLimit
	bool HTML1Attempted = false;
	bool FromOk = false;
	Summary = "";
	
	
	// preliminary exceptions/error checking ////////////////////////////////////
	
	if (SaveOnlyPolbotPages)
	{
		bool Polbot = Regex.IsMatch(ArticleText, @"\bPolbot\b", RegexOptions.IgnoreCase);
		if (!Polbot)
		{
			Summary = @"!Polbot. ";
			Skip = true;
		}
	}
	
	// a direct length comparison is equivalent to matching ^[\d\D]{Limit+1} and avoids building a regex
	if (SkipPagesLargerThanLimit && ArticleText.Length > Limit)
	{
		Summary += "Too big (>" + Limit + "B). ";
		Skip = true;
	}
	
	// check for inappropriate infoboxes
	string PeopleTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
	string ScientistTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist)(?=\s*(?:\||\<\!\-\-))";
	bool BadInfobox1 = Regex.IsMatch(ArticleText, PeopleTemplates_Regex, RegexOptions.IgnoreCase);
	bool BadInfobox2 = Regex.IsMatch(ArticleText, ScientistTemplates_Regex, RegexOptions.IgnoreCase);
	if (BadInfobox1 || BadInfobox2)
	{
		Summary += @"Person/scientist infobox found. ";
		Skip = true;
	}
	
	// check for appropriate infoboxes
	string TitleTemplates_Regex = @"\{\{\s*(?:DISPLAY ?TITLE|[Ii]talicisedtitle|[Ii]talicised[ _]+title|[Ii]talicizedtitle|[Ii]talicized[ _]+title|[Ii]talicizetitle|[Ii]talicize[ _]+title|[Ii]talicstitle|[Ii]talics[ _]+title|[Ii]talics|ITALICTITLE|[Ii]talictitle|[Ii]talic[ _]+title[ _]+infobox|[Ii]talic[ _]+title|[Ii]talic|[Ii]tal|[Rr]edirect[ _]+italic[ _]+title|[Tt]itle[ _]+italic)";
	
	string TaxoTemplates_Regex = @"\{\{\s*(?:Template:\s*|Wikipedia:\s*)?(?:Infobox[ _]+)?(" + // prefixes
										@"Taxobox|Taxo|TX|Speciesbox|Subspeciesbox|Infraspeciesbox|" + // taxo/species
										@"Automatic[ _]+t?axobox|" + // auto
										@"bacteria|microorganism|virus|oobox" + // other
										@")(?=\s*(?:\||\<\!\-\-|" + TitleTemplates_Regex + @"|(?<=Automatic[ _]+t?axobox\s*)\}\}))"; // suffixes
	bool NoTaxoTemplates = !Regex.IsMatch(ArticleText, TaxoTemplates_Regex, RegexOptions.IgnoreCase);
	if (NoTaxoTemplates && !ManuallyCheckPagesWithoutAnInfobox)
	{
		// With ManuallyCheckPagesWithoutAnInfobox on, a page without a taxobox is allowed through
		// for manual review (a person/scientist infobox has already set Skip above).
		Summary += @"No auto/taxo/speciesbox found. ";
		Skip = true;
	}
	
	// standardize & check for {{Taxonbar
	string TaxonbarAliases_Regex = @"\{\{\s*(?:[Tt]axobar|[Tt]axon\-bar|[Tt]axonbar|[Tt]axonBar|[Tt]axonIds|[Tt]axon[ _]+bar)(?=\s*[\|\}])"; // 0 grps
	ArticleText = Regex.Replace(ArticleText, TaxonbarAliases_Regex, @"{{Taxonbar", RegexOptions.IgnoreCase);
	int iTaxonbars = Regex.Matches(ArticleText, @"\{\{Taxonbar", RegexOptions.IgnoreCase).Count;
	if (iTaxonbars != 1)
	{
		Summary += @"Unexpected # of taxonbars: " + iTaxonbars + ". ";
		Skip = true;
	}
	
	// store all {{Taxonbar...}} contents for later checks
	string TBAll_Regex = @"\{\{Taxonbar([^\{\}]*)\}\}";
	Match mTBAll = Regex.Match(ArticleText, TBAll_Regex, RegexOptions.IgnoreCase);
	string TBAll = mTBAll.Value;
	
	bool BracketsInTB = (iTaxonbars > 0 && !mTBAll.Success);
	if (BracketsInTB)
	{
		Summary += @"Stray bracket(s) found in taxonbar. "; // can't add {{Taxonbar...}} to skip summary b/c mTBAll failed!
		Skip = true;
	}
	
	// get wikibase_item via WP API
	// ex: https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
	// Uri.EscapeDataString percent-encodes the UTF-8 bytes of every reserved/non-ASCII character.
	// The previous hand-written Replace chain emitted %96 for an en dash (not valid UTF-8) and
	// left characters such as % and " unescaped, so lookups for such titles failed.
	string ArticleTitle_URL = System.Uri.EscapeDataString(ArticleTitle);
	string URL1 = @"https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=" + 
						ArticleTitle_URL + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
	string HTML1 = "";
	if (!Skip)
	{
		HTML1Attempted = true;
		try
		{
			HTML1 = Tools.GetHTML(URL1);
		}
		catch
		{
			// deliberate best-effort: any failure of the request is reported via the summary
			Summary = "HTML1 failed. ArticleTitle_URL = " + ArticleTitle_URL + ". ";
			if (!LiveDebug)
			{
				Skip = true;
			}
		}
	}
	
	
	// WD/html error checks /////////////////////////////////////////////////////
	
	string QID = Regex.Match(HTML1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
	if (string.IsNullOrEmpty(QID) && !Skip)
	{
		Summary = @"QID retrieval failed. ";
		Skip = true;
	}
	
	if (!Regex.IsMatch(QID, @"^Q\d+$") && !Skip) // case sensitive, just to be safe
	{
		Summary = @"Unexpected QID format. ";
		Skip = true;
	}
	
	// The patterns below are built even when the QID failed validation, so escape it to make sure
	// a malformed value can never produce an invalid pattern. For a valid Q\d+ this is a no-op.
	string QID_Escaped = Regex.Escape(QID);
	
	bool CorrectExistingFrom1 = Regex.IsMatch(TBAll, @"\|\s*from1?\s*=\s*" + QID_Escaped + @"\s*[\|\}]", RegexOptions.IgnoreCase);
	if (CorrectExistingFrom1 && !Skip)
	{	// this should be the normal/most frequent skip case
		Summary = @"From1 already exists (correct) in " + TBAll + ". ";
		FromOk = true;
		Skip = true;
	}
	
	bool AnyFrom1  = Regex.IsMatch(TBAll, @"\|\s*from1?\s*=", RegexOptions.IgnoreCase);
	bool NullFrom1 = Regex.IsMatch(TBAll, @"\|\s*from1?\s*=\s*[\|\}]", RegexOptions.IgnoreCase);
	if (AnyFrom1 && !NullFrom1 && !CorrectExistingFrom1)
	{
		Summary += @"From1 already exists (incorrect)";
		// only quote the template if an earlier message has not already done so (case sensitive "T")
		if (Summary.Contains("Taxonbar"))
		{
			Summary += ". ";
		}
		else
		{
			Summary += " in " + TBAll + ". ";
		}
		Skip = true;
	}
	
	bool DupQ = (Regex.Matches(TBAll, @"\b" + QID_Escaped + @"\b", RegexOptions.IgnoreCase).Count > 1); // case INsensitive
	if (DupQ && !Skip)
	{
		Summary += @"Duplicate """ + QID + @"""s found";
		if (Summary.Contains("Taxonbar"))
		{
			Summary += ". ";
		}
		else
		{
			Summary += " in " + TBAll + ". ";
		}
		Skip = true;
	}
	
	bool NonEmptyTB = !string.IsNullOrEmpty(mTBAll.Groups[1].Value.Trim());
	if (NonEmptyTB && !CorrectExistingFrom1 && HTML1Attempted)
	{
		if (!AnyFrom1)
		{
			// OK to proceed
		}
		else if (NullFrom1)
		{	// remove the empty from=/from1= so the real one can be inserted below
			string TBAll_new = Regex.Replace(TBAll, @"\|\s*from1?\s*=\s*(?=[\|\}])", "", RegexOptions.IgnoreCase);
			ArticleText = ArticleText.Replace(TBAll, TBAll_new);
		}
		else
		{
			Summary += @"Extra text in TBAll; batch these more carefully later";
			if (Summary.Contains("Taxonbar"))
			{
				Summary += ". ";
			}
			else
			{
				Summary += @": " + TBAll + ". ";
			}
			Skip = true;
		}
	}
	
	
	// main /////////////////////////////////////////////////////////////////////
	
	if (!Skip)
	{
		bool AnyFrom2 = Regex.IsMatch(TBAll, @"\|\s*from2\s*=", RegexOptions.IgnoreCase);
		string One    = (AnyFrom2) ? "1" : ""; // only use from1 if from2 exists, otherwise from
		// reuse the template's own pipe/equals spacing so the new parameter matches its neighbours
		string Pipe   = Regex.Match(TBAll, @"\{\{Taxonbar(\s*\|\s*)").Groups[1].Value;
		string Equals = Regex.Match(TBAll, @"\s*=\s*").Value;
		if (string.IsNullOrEmpty(Pipe))
		{
			Pipe = "|";
		}
		if (string.IsNullOrEmpty(Equals))
		{
			Equals = "=";
		}
		string From1 = Pipe + "from" + One + Equals + QID;
		
		string InsertionPoint_Regex = @"(\{\{Taxonbar)(?=\s*[\|\}])";
		if (!Regex.IsMatch(ArticleText, InsertionPoint_Regex))
		{
			// e.g. a comment directly after the template name: nothing would be inserted, so do
			// not report an addition that never happened
			Summary = @"No insertion point found in " + TBAll + ". ";
			Skip = true;
		}
		else
		{
			ArticleText = Regex.Replace(ArticleText, InsertionPoint_Regex, @"$1" + From1);
			Summary = @"[[Template talk:Taxonbar#from1|Add from]]=[[d:Special:EntityPage/" + QID + @"|" + QID + @"]] to {{[[Template:Taxonbar|Taxonbar]]}}";
			
			// remove "form" typo if QIDs match
			string FormTypo1_Regex = @"(\{\{Taxonbar\s*\|\s*from1?\s*=\s*" + QID_Escaped + @"\s*)\|\s*form1?\s*=\s*" + QID_Escaped + @"\s*(?=\|)";   // 1 grp
			string FormTypo2_Regex = @"(\{\{Taxonbar\s*\|\s*from1?\s*=\s*" + QID_Escaped + @")\s*\|\s*form1?\s*=\s*" + QID_Escaped + @"\s*(?=\}\})"; // 1 grp
			bool FormTypo1 = Regex.IsMatch(ArticleText, FormTypo1_Regex, RegexOptions.IgnoreCase);
			bool FormTypo2 = Regex.IsMatch(ArticleText, FormTypo2_Regex, RegexOptions.IgnoreCase);
			ArticleText = Regex.Replace(ArticleText, FormTypo1_Regex, @"$1", RegexOptions.IgnoreCase);
			ArticleText = Regex.Replace(ArticleText, FormTypo2_Regex, @"$1", RegexOptions.IgnoreCase);
			if (FormTypo1 || FormTypo2)
			{
				Summary += @" (|form= typo)";
			}
			
			if (GenFixes)
			{
				Summary += @"; [[WP:GenFixes]] on";
			}
			Summary += ",";
		}
	}
	
	
	// exception tracking ///////////////////////////////////////////////////////
	
	// log skip reasons; the "already correct" case is logged only when explicitly requested
	if (Skip && SaveSkipSummaries && (!FromOk || SaveSkipSummaries_FromOk))
	{
		string Message = ArticleTitle + "\t" + Summary + "\n";
		string File = @"Module output - Add from1 parameter (skip summaries).txt";
		string Path = @"F:\"; // desktop
		string FullPath = Path + File;
		const bool APPEND = true;
		Tools.WriteTextFileAbsolutePath(Message, FullPath, APPEND);
	}
	
	return ArticleText;
}