User:Trappist the monk/lang-xx-Cyrl-Latn candidate lister
Appearance
AWB script to list {{langx|<tag>|...}} templates that have both Cyrillic- and Latin-script text, for <tag> values of cnr, sh, sr, and uz.
the c# module
//---------------------------< M A I N >----------------------------------------------------------------------
//
// from a list of articles that have {{langx|<TAG>|...}}, extract Cyrillic- and Latin-script text from those
// templates that have both.
//
// account for parameter aliases:
// {{{2}}} (not named), |2=, |text= – should be Cyrillic-script text
// {{{3}}} (not named), |3=, |translit= – should be Latin-script text
//
// for each template with both scripts, write a line to a local file:
// *[[<article name>]] – <code><nowiki>{{Lang-<TAG>-Cyrl-Latn|<Cyrillic text>|<Latin>}}</nowiki></code> → {{Lang-<TAG>-Cyrl-Latn|<Cyrillic text>|<Latin>}}
//
// for 'cnr' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *cnr *\|[^\}]+\|/
// for 'sh' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sh *\|[^\}]+\|/
// for 'sr' use:
// Category:Langx uses unsupported language tag – this category currently lists all {{langx|sr|...}} pages
// for alternate 'sr' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sr *\|[^\}]+\|/
// for 'uz' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *uz *\|[^\}]+\|/
//
static string TAG = "cnr"; // language tag; one of: 'cnr', 'sh', 'sr', 'uz' and then recompile

// AWB custom-module entry point.
//
// Scans ArticleText for {{langx|<TAG>|...}} templates.  For every such template that supplies both a
// Cyrillic-script value and a Latin-script value, one candidate line is appended to the local log file.
//
// The article is never modified: ArticleText is returned exactly as received, Summary is empty and
// Skip is always true so that AWB does not attempt to save anything.
//
// ArticleTitle is used only to build the wikilink in the logged line; wikiNamespace is not used.
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    Skip = true; // this script only collects data; never save
    Summary = "";

    // group 1: '{{langx|<TAG>|', group 2: everything up to the closing braces, group 3: '}}'
    // templates that contain nested braces are not matched because group 2 excludes '{' and '}'
    string pattern = @"(\{\{\s*[Ll]angx\s*\|\s*" + TAG + @"\s*\|\s*)([^\{\}]*)(\}\})";
    string log_file = @"Z:\Wikipedia\AWB\Monkbot_tasks\Monkbot_task_20\lang_" + TAG + "_data.txt"; // path to our file

    foreach (Match match in Regex.Matches(ArticleText, pattern))
    {
        if (match.Value.Contains("[[")) // abandon this template if it contains wikilinks
            continue; // because piped wikilinks would be split as if they were template parameters

        string cyrl;
        string latn;
        if (!ExtractCyrlLatn(match.Groups[2].Value, out cyrl, out latn))
            continue; // generate output only when both scripts are found

        // can't do syntaxhighlight because expensive
        string out_string = "*[[" + ArticleTitle + "]] – <code><nowiki>{{Lang-" + TAG + "-Cyrl-Latn|" + cyrl + "|" + latn + "}}</nowiki></code> → {{Lang-" + TAG + "-Cyrl-Latn|" + cyrl + "|" + latn + "}}";

        // 'using' guarantees the file handle is released even if the write throws
        using (System.IO.StreamWriter sw = System.IO.File.AppendText(log_file))
        {
            sw.WriteLine(out_string);
        }
    }

    return ArticleText; // unchanged
}

// Splits the parameter portion of a {{langx}} template (the text between '{{langx|<TAG>|' and '}}')
// on pipes and picks out the Cyrillic-script and Latin-script values.
//
//   named:      |2= and |text=     -> Cyrillic;  |3= and |translit= -> Latin; other names are ignored
//   positional: the first positional parameter is taken as Cyrillic when it contains a run of at least
//               two Cyrillic letters, otherwise it is taken as Latin; a positional parameter that follows
//               a Cyrillic positional is taken as Latin.  Scanning stops at the first Latin positional.
//
// Returns true only when both cyrl and latn are non-empty.
private static bool ExtractCyrlLatn(string parameters_str, out string cyrl, out string latn)
{
    cyrl = "";
    latn = "";
    bool have_positional_cyrl = false;

    foreach (string parameter in parameters_str.Split('|'))
    {
        int eq = parameter.IndexOf('=');
        if (eq >= 0) // named parameter: split on the first '=' only so values may themselves contain '='
        {
            string name = parameter.Substring(0, eq).Trim();
            string value = parameter.Substring(eq + 1).Trim();
            if (("2" == name) || ("text" == name))
                cyrl = value; // assume Cyrillic
            else if (("3" == name) || ("translit" == name))
                latn = value; // must be Latin
            continue;
        }

        // require real Cyrillic letters; digits, spaces and punctuation alone must not count as Cyrillic
        if (!have_positional_cyrl && Regex.IsMatch(parameter, @"[\p{IsCyrillic}\p{IsCyrillicSupplement}]{2,}"))
        {
            cyrl = parameter.Trim();
            have_positional_cyrl = true;
            continue;
        }

        latn = parameter.Trim(); // any other positional parameter must be Latin
        break; // and we're done looking
    }

    return ("" != cyrl) && ("" != latn);
}
the awb settings file
<?xml version="1.0" encoding="utf-8"?>
<AutoWikiBrowserPreferences xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xml:space="preserve" Version="6.3.1.1">
<Project>wikipedia</Project>
<LanguageCode>en</LanguageCode>
<CustomProject />
<Protocol>https://</Protocol>
<LoginDomain />
<List>
<ListSource>hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *cnr *\|[^\}]+\|/</ListSource>
<SelectedProvider>WikiSearchAllNSListProvider</SelectedProvider>
<ArticleList />
</List>
<FindAndReplace>
<Enabled>false</Enabled>
<IgnoreSomeText>false</IgnoreSomeText>
<IgnoreMoreText>false</IgnoreMoreText>
<AppendSummary>true</AppendSummary>
<Replacements />
<AdvancedReps />
<SubstTemplates />
<IncludeComments>false</IncludeComments>
<ExpandRecursively>true</ExpandRecursively>
<IgnoreUnformatted>false</IgnoreUnformatted>
</FindAndReplace>
<Editprefs>
<GeneralFixes>false</GeneralFixes>
<Tagger>false</Tagger>
<Unicodify>false</Unicodify>
<Recategorisation>0</Recategorisation>
<NewCategory />
<NewCategory2 />
<ReImage>0</ReImage>
<ImageFind />
<Replace />
<SkipIfNoCatChange>false</SkipIfNoCatChange>
<RemoveSortKey>false</RemoveSortKey>
<SkipIfNoImgChange>false</SkipIfNoImgChange>
<AppendText>false</AppendText>
<AppendTextMetaDataSort>false</AppendTextMetaDataSort>
<Append>true</Append>
<Text />
<Newlines>2</Newlines>
<AutoDelay>10</AutoDelay>
<BotMaxEdits>0</BotMaxEdits>
<SupressTag>false</SupressTag>
<RegexTypoFix>false</RegexTypoFix>
</Editprefs>
<General>
<AutoSaveEdit>
<Enabled>false</Enabled>
<SavePeriod>30</SavePeriod>
<SaveFile />
</AutoSaveEdit>
<SelectedSummary>no summary; nothing saved by this script</SelectedSummary>
<Summaries>
<string>clean up</string>
<string>re-categorisation per [[WP:CFD|CFD]]</string>
<string>clean up and re-categorisation per [[WP:CFD|CFD]]</string>
<string>removing category per [[WP:CFD|CFD]]</string>
<string>[[Wikipedia:Template substitution|subst:'ing]]</string>
<string>[[Wikipedia:WikiProject Stub sorting|stub sorting]]</string>
<string>[[WP:AWB/T|Typo fixing]]</string>
<string>bad link repair</string>
<string>Fixing [[Wikipedia:Disambiguation pages with links|links to disambiguation pages]]</string>
<string>Unicodifying</string>
<string>no summary; nothing saved by this script</string>
</Summaries>
<PasteMore>
<string />
<string />
<string />
<string />
<string />
<string />
<string />
<string />
<string />
<string />
</PasteMore>
<FindText />
<FindRegex>false</FindRegex>
<FindCaseSensitive>false</FindCaseSensitive>
<WordWrap>true</WordWrap>
<ToolBarEnabled>false</ToolBarEnabled>
<BypassRedirect>true</BypassRedirect>
<AutoSaveSettings>false</AutoSaveSettings>
<noSectionEditSummary>false</noSectionEditSummary>
<restrictDefaultsortAddition>true</restrictDefaultsortAddition>
<restrictOrphanTagging>true</restrictOrphanTagging>
<noMOSComplianceFixes>false</noMOSComplianceFixes>
<syntaxHighlightEditBox>false</syntaxHighlightEditBox>
<highlightAllFind>false</highlightAllFind>
<PreParseMode>false</PreParseMode>
<NoAutoChanges>false</NoAutoChanges>
<OnLoadAction>0</OnLoadAction>
<DiffInBotMode>false</DiffInBotMode>
<Minor>true</Minor>
<AddToWatchlist>2</AddToWatchlist>
<TimerEnabled>false</TimerEnabled>
<SortListAlphabetically>false</SortListAlphabetically>
<AddIgnoredToLog>false</AddIgnoredToLog>
<EditToolbarEnabled>false</EditToolbarEnabled>
<filterNonMainSpace>false</filterNonMainSpace>
<AutoFilterDuplicates>false</AutoFilterDuplicates>
<FocusAtEndOfEditBox>false</FocusAtEndOfEditBox>
<scrollToUnbalancedBrackets>false</scrollToUnbalancedBrackets>
<TextBoxSize>10</TextBoxSize>
<TextBoxFont>Courier New</TextBoxFont>
<LowThreadPriority>false</LowThreadPriority>
<Beep>false</Beep>
<Flash>false</Flash>
<Minimize>false</Minimize>
<LockSummary>false</LockSummary>
<SaveArticleList>true</SaveArticleList>
<SuppressUsingAWB>false</SuppressUsingAWB>
<AddUsingAWBToActionSummaries>false</AddUsingAWBToActionSummaries>
<IgnoreNoBots>false</IgnoreNoBots>
<ClearPageListOnProjectChange>false</ClearPageListOnProjectChange>
<SortInterWikiOrder>true</SortInterWikiOrder>
<ReplaceReferenceTags>true</ReplaceReferenceTags>
<LoggingEnabled>true</LoggingEnabled>
<AlertPreferences />
</General>
<SkipOptions>
<SkipNonexistent>true</SkipNonexistent>
<Skipexistent>false</Skipexistent>
<SkipDontCare>false</SkipDontCare>
<SkipWhenNoChanges>false</SkipWhenNoChanges>
<SkipSpamFilterBlocked>false</SkipSpamFilterBlocked>
<SkipInuse>false</SkipInuse>
<SkipWhenOnlyWhitespaceChanged>false</SkipWhenOnlyWhitespaceChanged>
<SkipOnlyGeneralFixChanges>true</SkipOnlyGeneralFixChanges>
<SkipOnlyMinorGeneralFixChanges>false</SkipOnlyMinorGeneralFixChanges>
<SkipOnlyCosmetic>false</SkipOnlyCosmetic>
<SkipOnlyCasingChanged>false</SkipOnlyCasingChanged>
<SkipIfRedirect>false</SkipIfRedirect>
<SkipIfNoAlerts>false</SkipIfNoAlerts>
<SkipDoes>false</SkipDoes>
<SkipDoesText />
<SkipDoesRegex>false</SkipDoesRegex>
<SkipDoesCaseSensitive>false</SkipDoesCaseSensitive>
<SkipDoesAfterProcessing>false</SkipDoesAfterProcessing>
<SkipDoesNot>false</SkipDoesNot>
<SkipDoesNotText />
<SkipDoesNotRegex>false</SkipDoesNotRegex>
<SkipDoesNotCaseSensitive>false</SkipDoesNotCaseSensitive>
<SkipDoesNotAfterProcessing>false</SkipDoesNotAfterProcessing>
<SkipNoFindAndReplace>false</SkipNoFindAndReplace>
<SkipMinorFindAndReplace>false</SkipMinorFindAndReplace>
<SkipNoRegexTypoFix>false</SkipNoRegexTypoFix>
<SkipNoDisambiguation>false</SkipNoDisambiguation>
<SkipNoLinksOnPage>false</SkipNoLinksOnPage>
<GeneralSkipList />
</SkipOptions>
<Module>
<Enabled>true</Enabled>
<Language>C# 4.0</Language>
<Code>//---------------------------< M A I N >----------------------------------------------------------------------
//
// from a list of articles that have {{langx|<TAG>|...}}, extract Cyrillic- and Latin-script text from those
// templates that have both.
//
// account for parameter aliases:
// {{{2}}} (not named), |2=, |text= – should be Cyrillic-script text
// {{{3}}} (not named), |3=, |translit= – should be Latin-script text
//
// for each template with both scripts, write a line to a local file:
// *[[<article name>]] – <code><nowiki>{{Lang-<TAG>-Cyrl-Latn|<Cyrillic text>|<Latin>}}</nowiki></code> → {{Lang-<TAG>-Cyrl-Latn|<Cyrillic text>|<Latin>}}
//
// for 'cnr' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *cnr *\|[^\}]+\|/
// for 'sh' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sh *\|[^\}]+\|/
// for 'sr' use:
// Category:Langx uses unsupported language tag – this category currently lists all {{langx|sr|...}} pages
// for alternate 'sr' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *sr *\|[^\}]+\|/
// for 'uz' search wikitext:
// hastemplate:"Langx" insource:/\{ *[Ll]angx *\| *uz *\|[^\}]+\|/
//
static string TAG = "cnr"; // language tag; one of: 'cnr', 'sh', 'sr', 'uz' and then recompile

// AWB custom-module entry point.
//
// Scans ArticleText for {{langx|<TAG>|...}} templates.  For every such template that supplies both a
// Cyrillic-script value and a Latin-script value, one candidate line is appended to the local log file.
//
// The article is never modified: ArticleText is returned exactly as received, Summary is empty and
// Skip is always true so that AWB does not attempt to save anything.
//
// ArticleTitle is used only to build the wikilink in the logged line; wikiNamespace is not used.
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    Skip = true; // this script only collects data; never save
    Summary = "";

    // group 1: '{{langx|<TAG>|', group 2: everything up to the closing braces, group 3: '}}'
    // templates that contain nested braces are not matched because group 2 excludes '{' and '}'
    string pattern = @"(\{\{\s*[Ll]angx\s*\|\s*" + TAG + @"\s*\|\s*)([^\{\}]*)(\}\})";
    string log_file = @"Z:\Wikipedia\AWB\Monkbot_tasks\Monkbot_task_20\lang_" + TAG + "_data.txt"; // path to our file

    foreach (Match match in Regex.Matches(ArticleText, pattern))
    {
        if (match.Value.Contains("[[")) // abandon this template if it contains wikilinks
            continue; // because piped wikilinks would be split as if they were template parameters

        string cyrl;
        string latn;
        if (!ExtractCyrlLatn(match.Groups[2].Value, out cyrl, out latn))
            continue; // generate output only when both scripts are found

        // can't do syntaxhighlight because expensive
        string out_string = "*[[" + ArticleTitle + "]] – <code><nowiki>{{Lang-" + TAG + "-Cyrl-Latn|" + cyrl + "|" + latn + "}}</nowiki></code> → {{Lang-" + TAG + "-Cyrl-Latn|" + cyrl + "|" + latn + "}}";

        // 'using' guarantees the file handle is released even if the write throws
        using (System.IO.StreamWriter sw = System.IO.File.AppendText(log_file))
        {
            sw.WriteLine(out_string);
        }
    }

    return ArticleText; // unchanged
}

// Splits the parameter portion of a {{langx}} template (the text between '{{langx|<TAG>|' and '}}')
// on pipes and picks out the Cyrillic-script and Latin-script values.
//
//   named:      |2= and |text=     -> Cyrillic;  |3= and |translit= -> Latin; other names are ignored
//   positional: the first positional parameter is taken as Cyrillic when it contains a run of at least
//               two Cyrillic letters, otherwise it is taken as Latin; a positional parameter that follows
//               a Cyrillic positional is taken as Latin.  Scanning stops at the first Latin positional.
//
// Returns true only when both cyrl and latn are non-empty.
private static bool ExtractCyrlLatn(string parameters_str, out string cyrl, out string latn)
{
    cyrl = "";
    latn = "";
    bool have_positional_cyrl = false;

    foreach (string parameter in parameters_str.Split('|'))
    {
        int eq = parameter.IndexOf('=');
        if (eq >= 0) // named parameter: split on the first '=' only so values may themselves contain '='
        {
            string name = parameter.Substring(0, eq).Trim();
            string value = parameter.Substring(eq + 1).Trim();
            if (("2" == name) || ("text" == name))
                cyrl = value; // assume Cyrillic
            else if (("3" == name) || ("translit" == name))
                latn = value; // must be Latin
            continue;
        }

        // require real Cyrillic letters; digits, spaces and punctuation alone must not count as Cyrillic
        if (!have_positional_cyrl && Regex.IsMatch(parameter, @"[\p{IsCyrillic}\p{IsCyrillicSupplement}]{2,}"))
        {
            cyrl = parameter.Trim();
            have_positional_cyrl = true;
            continue;
        }

        latn = parameter.Trim(); // any other positional parameter must be Latin
        break; // and we're done looking
    }

    return ("" != cyrl) && ("" != latn);
}</Code>
</Module>
<ExternalProgram>
<Enabled>false</Enabled>
<Skip>false</Skip>
<Program />
<Parameters />
<PassAsFile>true</PassAsFile>
<OutputFile />
</ExternalProgram>
<Disambiguation>
<Enabled>false</Enabled>
<Link />
<Variants />
<ContextChars>20</ContextChars>
</Disambiguation>
<Special>
<namespaceValues />
<remDupes>true</remDupes>
<sortAZ>true</sortAZ>
<filterTitlesThatContain>false</filterTitlesThatContain>
<filterTitlesThatContainText />
<filterTitlesThatDontContain>false</filterTitlesThatDontContain>
<filterTitlesThatDontContainText />
<areRegex>false</areRegex>
<opType>0</opType>
<remove />
</Special>
<Tool>
<ListComparerUseCurrentArticleList>0</ListComparerUseCurrentArticleList>
<ListSplitterUseCurrentArticleList>0</ListSplitterUseCurrentArticleList>
<DatabaseScannerUseCurrentArticleList>0</DatabaseScannerUseCurrentArticleList>
</Tool>
<Plugin>
<PluginPrefs>
<Name>CSV Loader</Name>
<PluginSettings>
<anyType xsi:type="PrefsKeyPair">
<Name>TextMode</Name>
<Setting xsi:type="xsd:string">Append</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>InputText</Name>
<Setting xsi:type="xsd:string" />
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>ColumnHeaders</Name>
<Setting xsi:type="xsd:string" />
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>Skip</Name>
<Setting xsi:type="xsd:boolean">true</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>Separator</Name>
<Setting xsi:type="xsd:string">,</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>CreateLists</Name>
<Setting xsi:type="xsd:boolean">false</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>ListSeparator</Name>
<Setting xsi:type="xsd:string">^</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>FindReplace</Name>
<Setting xsi:type="xsd:boolean">false</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>EditSummary</Name>
<Setting xsi:type="xsd:string" />
</anyType>
</PluginSettings>
</PluginPrefs>
</Plugin>
</AutoWikiBrowserPreferences>