Jump to content

User:Trappist the monk/lang-xx-Cyrl-Latn candidate lister

From Wikipedia, the free encyclopedia

AWB script to list {{langx|<tag>|...}} templates that have both Cyrillic- and Latin-script text, for <tag> values of cnr, sh, sr, and uz.

the c# module

[edit]
//---------------------------< M A I N >----------------------------------------------------------------------
//
// from a list of articles that have {{langx|<TAG>|...}}, extract Cyrillic- and Latin-script text from those
// templates that have both.
// 
// account for parameter aliases:
//		{{{2}}} (not named), |2=, |text= – should be Cyrillic-script text
//		{{{3}}} (not named), |3=, |translit= – should be Latin-script text
//
// for each template with both scripts, write a line to a local file:
//		*[[<article name>]] – <code><nowiki>{{Lang-<TAG>-Cyrl-Latn|<Cyrillic text>|<Latin>}}</nowiki></code>  → {{Lang-<TAG>-Cyrl-Latn|<Cyrillic text>|<Latin>}}
//
// for 'cnr' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *cnr *\|[^\}]+\|/
// for 'sh' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sh *\|[^\}]+\|/
// for 'sr' use:
//		Category:Langx uses unsupported language tag – this category currently lists all {{langx|sr|...}} pages
// for alternate 'sr' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sr *\|[^\}]+\|/
// for 'uz' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *uz *\|[^\}]+\|/
//

static string TAG = "cnr";		// language tag; one of: 'cnr', 'sh', 'sr', 'uz' and then recompile

//---------------------------< P R O C E S S _ A R T I C L E >------------------------------------------------
//
// AWB custom-module entry point.  Scans ArticleText for {{langx|TAG|...}} templates and, for every template
// that yields both Cyrillic-script and Latin-script text, appends one wikitext list item to the local log file.
//
// parameters:
//		ArticleText – wikitext of the article; returned unchanged
//		ArticleTitle – used only to build the wikilink at the start of each logged line
//		wikiNamespace – not used
//		Summary – always set to the empty string
//		Skip – always set to true; this module never changes the article
//
// Templates that contain '[[' are ignored because a pipe inside a wikilink would be mistaken for a parameter
// separator.  Exceptions raised while opening or writing the log file are not caught here.
//

public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
	{
	Summary = "";
	Skip = true;										// the article text is never modified

	string pattern = @"(\{\{\s*[Ll]angx\s*\|\s*" + Regex.Escape (TAG) + @"\s*\|\s*)([^\{\}]*)(\}\})";	// regex to find {{langx|TAG|...}} templates

	foreach (Match match in Regex.Matches (ArticleText, pattern))
		{
		if (match.Value.Contains ("[["))				// abandon this template if it contains wikilinks
			continue;									// because wikilink pipes confuse the parameter split

		string	cyrl;
		string	latn;
		ExtractScripts (match.Groups[2].Value, out cyrl, out latn);	// group 2 is the template without '{{langx|TAG|' and '}}'

		if (("" == cyrl) || ("" == latn))				// generate output only when both are found
			continue;
														// can't do syntaxhighlight because expensive
		string	lang_template = "{{Lang-" + TAG + "-Cyrl-Latn|" + cyrl + "|" + latn + "}}";
		string	out_string = "*[[" + ArticleTitle + "]] – <code><nowiki>" + lang_template + "</nowiki></code>  → " + lang_template;
		string	log_file = @"Z:\Wikipedia\AWB\Monkbot_tasks\Monkbot_task_20\lang_"+ TAG + "_data.txt";	// path to our file

		using (System.IO.StreamWriter sw = System.IO.File.AppendText (log_file))	// open for appending; 'using' closes the file even if the write throws
			sw.WriteLine (out_string);
		}

	return ArticleText;									// unmodified article text
	}

//---------------------------< E X T R A C T _ S C R I P T S >------------------------------------------------
//
// Splits the parameter portion of a {{langx|TAG|...}} template on pipes and picks out the Cyrillic-script and
// Latin-script text.  Both out values are the empty string when the corresponding text is not found.
//
// named parameters:
//		|2= and |text= supply the Cyrillic-script text
//		|3= and |translit= supply the Latin-script text
//		any other named parameter is ignored
// positional parameters:
//		the first one is taken as Cyrillic when it holds a run of two or more Cyrillic letters; the next
//		positional parameter is then taken as Latin and the scan stops
//		otherwise the first one is taken as Latin and the scan stops
//

private static void ExtractScripts (string parameters_str, out string cyrl, out string latn)
	{
	cyrl = "";
	latn = "";

	bool have_cyrl_positional = false;

	foreach (string parameter in parameters_str.Split ('|'))
		{
		int eq = parameter.IndexOf ('=');

		if (-1 != eq)									// named parameter: split at the first '=' so that a value may itself contain '='
			{
			string	name = parameter.Substring (0, eq).Trim();
			string	value = parameter.Substring (eq + 1).Trim();

			if (("2" == name) || ("text" == name))
				cyrl = value;
			else if (("3" == name) || ("translit" == name))
				latn = value;

			continue;
			}

		if (have_cyrl_positional)						// second positional parameter must be Latin
			{
			latn = parameter.Trim();
			break;										// and we're done looking
			}

		// only Cyrillic letters count here; quotes, digits, spaces and hyphens also occur in Latin text
		// (for example ''italic'' markup) and so cannot be used to identify the script
		if (Regex.IsMatch (parameter, @"[\p{IsCyrillic}\p{IsCyrillicSupplement}]{2,}"))
			{
			cyrl = parameter.Trim();
			have_cyrl_positional = true;
			}
		else											// first positional not Cyrillic
			{
			latn = parameter.Trim();					// must be Latin
			break;										// so we're done looking
			}
		}
	}

the awb settings file

[edit]
<?xml version="1.0" encoding="utf-8"?>
<AutoWikiBrowserPreferences xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xml:space="preserve" Version="6.3.1.1">
  <Project>wikipedia</Project>
  <LanguageCode>en</LanguageCode>
  <CustomProject />
  <Protocol>https://</Protocol>
  <LoginDomain />
  <List>
    <ListSource>hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *cnr *\|[^\}]+\|/</ListSource>
    <SelectedProvider>WikiSearchAllNSListProvider</SelectedProvider>
    <ArticleList />
  </List>
  <FindAndReplace>
    <Enabled>false</Enabled>
    <IgnoreSomeText>false</IgnoreSomeText>
    <IgnoreMoreText>false</IgnoreMoreText>
    <AppendSummary>true</AppendSummary>
    <Replacements />
    <AdvancedReps />
    <SubstTemplates />
    <IncludeComments>false</IncludeComments>
    <ExpandRecursively>true</ExpandRecursively>
    <IgnoreUnformatted>false</IgnoreUnformatted>
  </FindAndReplace>
  <Editprefs>
    <GeneralFixes>false</GeneralFixes>
    <Tagger>false</Tagger>
    <Unicodify>false</Unicodify>
    <Recategorisation>0</Recategorisation>
    <NewCategory />
    <NewCategory2 />
    <ReImage>0</ReImage>
    <ImageFind />
    <Replace />
    <SkipIfNoCatChange>false</SkipIfNoCatChange>
    <RemoveSortKey>false</RemoveSortKey>
    <SkipIfNoImgChange>false</SkipIfNoImgChange>
    <AppendText>false</AppendText>
    <AppendTextMetaDataSort>false</AppendTextMetaDataSort>
    <Append>true</Append>
    <Text />
    <Newlines>2</Newlines>
    <AutoDelay>10</AutoDelay>
    <BotMaxEdits>0</BotMaxEdits>
    <SupressTag>false</SupressTag>
    <RegexTypoFix>false</RegexTypoFix>
  </Editprefs>
  <General>
    <AutoSaveEdit>
      <Enabled>false</Enabled>
      <SavePeriod>30</SavePeriod>
      <SaveFile />
    </AutoSaveEdit>
    <SelectedSummary>no summary; nothing saved by this script</SelectedSummary>
    <Summaries>
      <string>clean up</string>
      <string>re-categorisation per [[WP:CFD|CFD]]</string>
      <string>clean up and re-categorisation per [[WP:CFD|CFD]]</string>
      <string>removing category per [[WP:CFD|CFD]]</string>
      <string>[[Wikipedia:Template substitution|subst:'ing]]</string>
      <string>[[Wikipedia:WikiProject Stub sorting|stub sorting]]</string>
      <string>[[WP:AWB/T|Typo fixing]]</string>
      <string>bad link repair</string>
      <string>Fixing [[Wikipedia:Disambiguation pages with links|links to disambiguation pages]]</string>
      <string>Unicodifying</string>
      <string>no summary; nothing saved by this script</string>
    </Summaries>
    <PasteMore>
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
    </PasteMore>
    <FindText />
    <FindRegex>false</FindRegex>
    <FindCaseSensitive>false</FindCaseSensitive>
    <WordWrap>true</WordWrap>
    <ToolBarEnabled>false</ToolBarEnabled>
    <BypassRedirect>true</BypassRedirect>
    <AutoSaveSettings>false</AutoSaveSettings>
    <noSectionEditSummary>false</noSectionEditSummary>
    <restrictDefaultsortAddition>true</restrictDefaultsortAddition>
    <restrictOrphanTagging>true</restrictOrphanTagging>
    <noMOSComplianceFixes>false</noMOSComplianceFixes>
    <syntaxHighlightEditBox>false</syntaxHighlightEditBox>
    <highlightAllFind>false</highlightAllFind>
    <PreParseMode>false</PreParseMode>
    <NoAutoChanges>false</NoAutoChanges>
    <OnLoadAction>0</OnLoadAction>
    <DiffInBotMode>false</DiffInBotMode>
    <Minor>true</Minor>
    <AddToWatchlist>2</AddToWatchlist>
    <TimerEnabled>false</TimerEnabled>
    <SortListAlphabetically>false</SortListAlphabetically>
    <AddIgnoredToLog>false</AddIgnoredToLog>
    <EditToolbarEnabled>false</EditToolbarEnabled>
    <filterNonMainSpace>false</filterNonMainSpace>
    <AutoFilterDuplicates>false</AutoFilterDuplicates>
    <FocusAtEndOfEditBox>false</FocusAtEndOfEditBox>
    <scrollToUnbalancedBrackets>false</scrollToUnbalancedBrackets>
    <TextBoxSize>10</TextBoxSize>
    <TextBoxFont>Courier New</TextBoxFont>
    <LowThreadPriority>false</LowThreadPriority>
    <Beep>false</Beep>
    <Flash>false</Flash>
    <Minimize>false</Minimize>
    <LockSummary>false</LockSummary>
    <SaveArticleList>true</SaveArticleList>
    <SuppressUsingAWB>false</SuppressUsingAWB>
    <AddUsingAWBToActionSummaries>false</AddUsingAWBToActionSummaries>
    <IgnoreNoBots>false</IgnoreNoBots>
    <ClearPageListOnProjectChange>false</ClearPageListOnProjectChange>
    <SortInterWikiOrder>true</SortInterWikiOrder>
    <ReplaceReferenceTags>true</ReplaceReferenceTags>
    <LoggingEnabled>true</LoggingEnabled>
    <AlertPreferences />
  </General>
  <SkipOptions>
    <SkipNonexistent>true</SkipNonexistent>
    <Skipexistent>false</Skipexistent>
    <SkipDontCare>false</SkipDontCare>
    <SkipWhenNoChanges>false</SkipWhenNoChanges>
    <SkipSpamFilterBlocked>false</SkipSpamFilterBlocked>
    <SkipInuse>false</SkipInuse>
    <SkipWhenOnlyWhitespaceChanged>false</SkipWhenOnlyWhitespaceChanged>
    <SkipOnlyGeneralFixChanges>true</SkipOnlyGeneralFixChanges>
    <SkipOnlyMinorGeneralFixChanges>false</SkipOnlyMinorGeneralFixChanges>
    <SkipOnlyCosmetic>false</SkipOnlyCosmetic>
    <SkipOnlyCasingChanged>false</SkipOnlyCasingChanged>
    <SkipIfRedirect>false</SkipIfRedirect>
    <SkipIfNoAlerts>false</SkipIfNoAlerts>
    <SkipDoes>false</SkipDoes>
    <SkipDoesText />
    <SkipDoesRegex>false</SkipDoesRegex>
    <SkipDoesCaseSensitive>false</SkipDoesCaseSensitive>
    <SkipDoesAfterProcessing>false</SkipDoesAfterProcessing>
    <SkipDoesNot>false</SkipDoesNot>
    <SkipDoesNotText />
    <SkipDoesNotRegex>false</SkipDoesNotRegex>
    <SkipDoesNotCaseSensitive>false</SkipDoesNotCaseSensitive>
    <SkipDoesNotAfterProcessing>false</SkipDoesNotAfterProcessing>
    <SkipNoFindAndReplace>false</SkipNoFindAndReplace>
    <SkipMinorFindAndReplace>false</SkipMinorFindAndReplace>
    <SkipNoRegexTypoFix>false</SkipNoRegexTypoFix>
    <SkipNoDisambiguation>false</SkipNoDisambiguation>
    <SkipNoLinksOnPage>false</SkipNoLinksOnPage>
    <GeneralSkipList />
  </SkipOptions>
  <Module>
    <Enabled>true</Enabled>
    <Language>C# 4.0</Language>
    <Code>//---------------------------&lt; M A I N &gt;----------------------------------------------------------------------
//
// from a list of articles that have {{langx|&lt;TAG&gt;|...}}, extract Cyrillic- and Latin-script text from those
// templates that have both.
// 
// account for parameter aliases:
//		{{{2}}} (not named), |2=, |text= – should be Cyrillic-script text
//		{{{3}}} (not named), |3=, |translit= – should be Latin-script text
//
// for each template with both scripts, write a line to a local file:
//		*[[&lt;article name&gt;]] – &lt;code&gt;&lt;nowiki&gt;{{Lang-&lt;TAG&gt;-Cyrl-Latn|&lt;Cyrillic text&gt;|&lt;Latin&gt;}}&lt;/nowiki&gt;&lt;/code&gt;  → {{Lang-&lt;TAG&gt;-Cyrl-Latn|&lt;Cyrillic text&gt;|&lt;Latin&gt;}}
//
// for 'cnr' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *cnr *\|[^\}]+\|/
// for 'sh' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sh *\|[^\}]+\|/
// for 'sr' use:
//		Category:Langx uses unsupported language tag – this category currently lists all {{langx|sr|...}} pages
// for alternate 'sr' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sr *\|[^\}]+\|/
// for 'uz' search wikitext:
//		hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *uz *\|[^\}]+\|/
//

static string TAG = "cnr";		// language tag; one of: 'cnr', 'sh', 'sr', 'uz' and then recompile

//---------------------------&lt; P R O C E S S _ A R T I C L E &gt;------------------------------------------------
//
// AWB custom-module entry point.  Scans ArticleText for {{langx|TAG|...}} templates and, for every template
// that yields both Cyrillic-script and Latin-script text, appends one wikitext list item to the local log file.
//
// parameters:
//		ArticleText – wikitext of the article; returned unchanged
//		ArticleTitle – used only to build the wikilink at the start of each logged line
//		wikiNamespace – not used
//		Summary – always set to the empty string
//		Skip – always set to true; this module never changes the article
//
// Templates that contain '[[' are ignored because a pipe inside a wikilink would be mistaken for a parameter
// separator.  Exceptions raised while opening or writing the log file are not caught here.
//

public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
	{
	Summary = "";
	Skip = true;										// the article text is never modified

	string pattern = @"(\{\{\s*[Ll]angx\s*\|\s*" + Regex.Escape (TAG) + @"\s*\|\s*)([^\{\}]*)(\}\})";	// regex to find {{langx|TAG|...}} templates

	foreach (Match match in Regex.Matches (ArticleText, pattern))
		{
		if (match.Value.Contains ("[["))				// abandon this template if it contains wikilinks
			continue;									// because wikilink pipes confuse the parameter split

		string	cyrl;
		string	latn;
		ExtractScripts (match.Groups[2].Value, out cyrl, out latn);	// group 2 is the template without '{{langx|TAG|' and '}}'

		if (("" == cyrl) || ("" == latn))				// generate output only when both are found
			continue;
														// can't do syntaxhighlight because expensive
		string	lang_template = "{{Lang-" + TAG + "-Cyrl-Latn|" + cyrl + "|" + latn + "}}";
		string	out_string = "*[[" + ArticleTitle + "]] – &lt;code&gt;&lt;nowiki&gt;" + lang_template + "&lt;/nowiki&gt;&lt;/code&gt;  → " + lang_template;
		string	log_file = @"Z:\Wikipedia\AWB\Monkbot_tasks\Monkbot_task_20\lang_"+ TAG + "_data.txt";	// path to our file

		using (System.IO.StreamWriter sw = System.IO.File.AppendText (log_file))	// open for appending; 'using' closes the file even if the write throws
			sw.WriteLine (out_string);
		}

	return ArticleText;									// unmodified article text
	}

//---------------------------&lt; E X T R A C T _ S C R I P T S &gt;------------------------------------------------
//
// Splits the parameter portion of a {{langx|TAG|...}} template on pipes and picks out the Cyrillic-script and
// Latin-script text.  Both out values are the empty string when the corresponding text is not found.
//
// named parameters:
//		|2= and |text= supply the Cyrillic-script text
//		|3= and |translit= supply the Latin-script text
//		any other named parameter is ignored
// positional parameters:
//		the first one is taken as Cyrillic when it holds a run of two or more Cyrillic letters; the next
//		positional parameter is then taken as Latin and the scan stops
//		otherwise the first one is taken as Latin and the scan stops
//

private static void ExtractScripts (string parameters_str, out string cyrl, out string latn)
	{
	cyrl = "";
	latn = "";

	bool have_cyrl_positional = false;

	foreach (string parameter in parameters_str.Split ('|'))
		{
		int eq = parameter.IndexOf ('=');

		if (-1 != eq)									// named parameter: split at the first '=' so that a value may itself contain '='
			{
			string	name = parameter.Substring (0, eq).Trim();
			string	value = parameter.Substring (eq + 1).Trim();

			if (("2" == name) || ("text" == name))
				cyrl = value;
			else if (("3" == name) || ("translit" == name))
				latn = value;

			continue;
			}

		if (have_cyrl_positional)						// second positional parameter must be Latin
			{
			latn = parameter.Trim();
			break;										// and we're done looking
			}

		// only Cyrillic letters count here; quotes, digits, spaces and hyphens also occur in Latin text
		// (for example ''italic'' markup) and so cannot be used to identify the script
		if (Regex.IsMatch (parameter, @"[\p{IsCyrillic}\p{IsCyrillicSupplement}]{2,}"))
			{
			cyrl = parameter.Trim();
			have_cyrl_positional = true;
			}
		else											// first positional not Cyrillic
			{
			latn = parameter.Trim();					// must be Latin
			break;										// so we're done looking
			}
		}
	}</Code>
  </Module>
  <ExternalProgram>
    <Enabled>false</Enabled>
    <Skip>false</Skip>
    <Program />
    <Parameters />
    <PassAsFile>true</PassAsFile>
    <OutputFile />
  </ExternalProgram>
  <Disambiguation>
    <Enabled>false</Enabled>
    <Link />
    <Variants />
    <ContextChars>20</ContextChars>
  </Disambiguation>
  <Special>
    <namespaceValues />
    <remDupes>true</remDupes>
    <sortAZ>true</sortAZ>
    <filterTitlesThatContain>false</filterTitlesThatContain>
    <filterTitlesThatContainText />
    <filterTitlesThatDontContain>false</filterTitlesThatDontContain>
    <filterTitlesThatDontContainText />
    <areRegex>false</areRegex>
    <opType>0</opType>
    <remove />
  </Special>
  <Tool>
    <ListComparerUseCurrentArticleList>0</ListComparerUseCurrentArticleList>
    <ListSplitterUseCurrentArticleList>0</ListSplitterUseCurrentArticleList>
    <DatabaseScannerUseCurrentArticleList>0</DatabaseScannerUseCurrentArticleList>
  </Tool>
  <Plugin>
    <PluginPrefs>
      <Name>CSV Loader</Name>
      <PluginSettings>
        <anyType xsi:type="PrefsKeyPair">
          <Name>TextMode</Name>
          <Setting xsi:type="xsd:string">Append</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>InputText</Name>
          <Setting xsi:type="xsd:string" />
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>ColumnHeaders</Name>
          <Setting xsi:type="xsd:string" />
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>Skip</Name>
          <Setting xsi:type="xsd:boolean">true</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>Separator</Name>
          <Setting xsi:type="xsd:string">,</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>CreateLists</Name>
          <Setting xsi:type="xsd:boolean">false</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>ListSeparator</Name>
          <Setting xsi:type="xsd:string">^</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>FindReplace</Name>
          <Setting xsi:type="xsd:boolean">false</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>EditSummary</Name>
          <Setting xsi:type="xsd:string" />
        </anyType>
      </PluginSettings>
    </PluginPrefs>
  </Plugin>
</AutoWikiBrowserPreferences>