User:Trappist the monk/IANA subtag registry file update
Appearance
Module:Lang relies upon a series of Lua data modules that are transcribed from the IANA language-subtag-registry file:
- Module:Lang/data/iana languages
- Module:Lang/data/iana scripts
- Module:Lang/data/iana regions
- Module:Lang/data/iana variants
- Module:Lang/data/iana suppressed scripts
- Module:ISO 639 name/ISO 639-1
All of these modules can be updated by using Module:Lang/data/iana_languages/make and copying and pasting the appropriate sections of that module's output into the related data module. This is a crude AWB (AutoWikiBrowser) script that can be used to update all of the data modules in a more 'automatic' manner. The script fetches the language-subtag-registry file from the IANA website, parses it, and updates the various modules.
The process is:
- open awb and load the settings file.
- click start
- review the changes, and if acceptable, click save; repeat 'til done.
settings file
<?xml version="1.0" encoding="utf-8"?>
<AutoWikiBrowserPreferences xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xml:space="preserve" Version="6.3.0.0">
<Project>wikipedia</Project>
<LanguageCode>en</LanguageCode>
<CustomProject />
<Protocol>http://</Protocol>
<LoginDomain />
<List>
<ListSource />
<SelectedProvider>CategoryListProvider</SelectedProvider>
<ArticleList>
<Article NameSpaceKey="828">
<PreProcessed>false</PreProcessed>
<Name>Module:Lang/data/iana languages</Name>
</Article>
<Article NameSpaceKey="828">
<PreProcessed>false</PreProcessed>
<Name>Module:Lang/data/iana scripts</Name>
</Article>
<Article NameSpaceKey="828">
<PreProcessed>false</PreProcessed>
<Name>Module:Lang/data/iana regions</Name>
</Article>
<Article NameSpaceKey="828">
<PreProcessed>false</PreProcessed>
<Name>Module:Lang/data/iana variants</Name>
</Article>
<Article NameSpaceKey="828">
<PreProcessed>false</PreProcessed>
<Name>Module:Lang/data/iana suppressed scripts</Name>
</Article>
<Article NameSpaceKey="828">
<PreProcessed>false</PreProcessed>
<Name>Module:ISO 639 name/ISO 639-1</Name>
</Article>
</ArticleList>
</List>
<FindAndReplace>
<Enabled>false</Enabled>
<IgnoreSomeText>false</IgnoreSomeText>
<IgnoreMoreText>false</IgnoreMoreText>
<AppendSummary>false</AppendSummary>
<Replacements />
<AdvancedReps />
<SubstTemplates />
<IncludeComments>false</IncludeComments>
<ExpandRecursively>true</ExpandRecursively>
<IgnoreUnformatted>false</IgnoreUnformatted>
</FindAndReplace>
<Editprefs>
<GeneralFixes>false</GeneralFixes>
<Tagger>false</Tagger>
<Unicodify>false</Unicodify>
<Recategorisation>0</Recategorisation>
<NewCategory />
<NewCategory2 />
<ReImage>0</ReImage>
<ImageFind />
<Replace />
<SkipIfNoCatChange>false</SkipIfNoCatChange>
<RemoveSortKey>false</RemoveSortKey>
<SkipIfNoImgChange>false</SkipIfNoImgChange>
<AppendText>false</AppendText>
<AppendTextMetaDataSort>false</AppendTextMetaDataSort>
<Append>false</Append>
<Text />
<Newlines>0</Newlines>
<AutoDelay>0</AutoDelay>
<BotMaxEdits>0</BotMaxEdits>
<SupressTag>false</SupressTag>
<RegexTypoFix>false</RegexTypoFix>
</Editprefs>
<General>
<AutoSaveEdit>
<Enabled>false</Enabled>
<SavePeriod>30</SavePeriod>
<SaveFile />
</AutoSaveEdit>
<SelectedSummary />
<Summaries>
<string>clean up</string>
<string>re-categorisation per [[WP:CFD|CFD]]</string>
<string>clean up and re-categorisation per [[WP:CFD|CFD]]</string>
<string>removing category per [[WP:CFD|CFD]]</string>
<string>[[Wikipedia:Template substitution|subst:'ing]]</string>
<string>[[Wikipedia:WikiProject Stub sorting|stub sorting]]</string>
<string>[[WP:AWB/T|Typo fixing]]</string>
<string>bad link repair</string>
<string>Fixing [[Wikipedia:Disambiguation pages with links|links to disambiguation pages]]</string>
<string>Unicodifying</string>
<string>use [[Module:Lang]]; see [[Template talk:Lang#converting to lua]];</string>
</Summaries>
<PasteMore>
<string />
<string />
<string />
<string />
<string />
<string />
<string />
<string />
<string />
<string />
</PasteMore>
<FindText />
<FindRegex>false</FindRegex>
<FindCaseSensitive>false</FindCaseSensitive>
<WordWrap>true</WordWrap>
<ToolBarEnabled>false</ToolBarEnabled>
<BypassRedirect>true</BypassRedirect>
<AutoSaveSettings>false</AutoSaveSettings>
<noSectionEditSummary>true</noSectionEditSummary>
<restrictDefaultsortAddition>true</restrictDefaultsortAddition>
<restrictOrphanTagging>true</restrictOrphanTagging>
<noMOSComplianceFixes>false</noMOSComplianceFixes>
<syntaxHighlightEditBox>false</syntaxHighlightEditBox>
<highlightAllFind>false</highlightAllFind>
<PreParseMode>false</PreParseMode>
<NoAutoChanges>false</NoAutoChanges>
<OnLoadAction>0</OnLoadAction>
<DiffInBotMode>false</DiffInBotMode>
<Minor>true</Minor>
<AddToWatchlist>2</AddToWatchlist>
<TimerEnabled>false</TimerEnabled>
<SortListAlphabetically>false</SortListAlphabetically>
<AddIgnoredToLog>false</AddIgnoredToLog>
<EditToolbarEnabled>false</EditToolbarEnabled>
<filterNonMainSpace>false</filterNonMainSpace>
<AutoFilterDuplicates>false</AutoFilterDuplicates>
<FocusAtEndOfEditBox>false</FocusAtEndOfEditBox>
<scrollToUnbalancedBrackets>false</scrollToUnbalancedBrackets>
<TextBoxSize>10</TextBoxSize>
<TextBoxFont>Courier New</TextBoxFont>
<LowThreadPriority>false</LowThreadPriority>
<Beep>false</Beep>
<Flash>false</Flash>
<Minimize>false</Minimize>
<LockSummary>false</LockSummary>
<SaveArticleList>true</SaveArticleList>
<SuppressUsingAWB>false</SuppressUsingAWB>
<AddUsingAWBToActionSummaries>false</AddUsingAWBToActionSummaries>
<IgnoreNoBots>false</IgnoreNoBots>
<ClearPageListOnProjectChange>false</ClearPageListOnProjectChange>
<SortInterWikiOrder>true</SortInterWikiOrder>
<ReplaceReferenceTags>true</ReplaceReferenceTags>
<LoggingEnabled>true</LoggingEnabled>
<AlertPreferences />
</General>
<SkipOptions>
<SkipNonexistent>true</SkipNonexistent>
<Skipexistent>false</Skipexistent>
<SkipDontCare>false</SkipDontCare>
<SkipWhenNoChanges>false</SkipWhenNoChanges>
<SkipSpamFilterBlocked>false</SkipSpamFilterBlocked>
<SkipInuse>false</SkipInuse>
<SkipWhenOnlyWhitespaceChanged>false</SkipWhenOnlyWhitespaceChanged>
<SkipOnlyGeneralFixChanges>true</SkipOnlyGeneralFixChanges>
<SkipOnlyMinorGeneralFixChanges>false</SkipOnlyMinorGeneralFixChanges>
<SkipOnlyCosmetic>false</SkipOnlyCosmetic>
<SkipOnlyCasingChanged>false</SkipOnlyCasingChanged>
<SkipIfRedirect>false</SkipIfRedirect>
<SkipIfNoAlerts>false</SkipIfNoAlerts>
<SkipDoes>false</SkipDoes>
<SkipDoesText />
<SkipDoesRegex>false</SkipDoesRegex>
<SkipDoesCaseSensitive>false</SkipDoesCaseSensitive>
<SkipDoesAfterProcessing>false</SkipDoesAfterProcessing>
<SkipDoesNot>false</SkipDoesNot>
<SkipDoesNotText />
<SkipDoesNotRegex>false</SkipDoesNotRegex>
<SkipDoesNotCaseSensitive>false</SkipDoesNotCaseSensitive>
<SkipDoesNotAfterProcessing>false</SkipDoesNotAfterProcessing>
<SkipNoFindAndReplace>false</SkipNoFindAndReplace>
<SkipMinorFindAndReplace>false</SkipMinorFindAndReplace>
<SkipNoRegexTypoFix>false</SkipNoRegexTypoFix>
<SkipNoDisambiguation>false</SkipNoDisambiguation>
<SkipNoLinksOnPage>false</SkipNoLinksOnPage>
<GeneralSkipList />
</SkipOptions>
<Module>
<Enabled>true</Enabled>
<Language>C# 3.5</Language>
<Code>// this script is intended to simplify updates from:
// IANA [https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry language-subtag-registry file]
// to:
// [[Module:Lang/data/iana languages]]
// [[Module:Lang/data/iana scripts]]
// [[Module:Lang/data/iana regions]]
// [[Module:Lang/data/iana variants]]
// [[Module:Lang/data/iana suppressed scripts]]
// [[Module:ISO 639 name/ISO 639-1]]
//
// updated 2024-07-12 to account for shift of data files from Module:Language/data/~ to Module:Lang/data/~
//---------------------------< P R O C E S S   A R T I C L E >------------------------------------------------
//
// Entry point that AWB invokes with a page's title and text.  The incoming text is not edited; it is replaced
// wholesale by a Lua data table generated from the lists that the static constructor filled in.
//
//	ArticleText:	current page text; handed back untouched whenever the page is skipped
//	ArticleTitle:	selects which data module is generated
//	wikiNamespace:	not used
//	Summary:	edit summary: an 'ERROR: ' or 'DEBUG: ' message, 'Unexpected article: ...', or the registry date
//	Skip:		true when ERROR_summary is set or when ArticleTitle is not one of the known data modules
//
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
	Skip = false;

	if (null != ERROR_summary)										// the registry could not be read; touch nothing
	{
		Summary = "ERROR: " + ERROR_summary;
		Skip = true;
		return ArticleText;
	}

	Summary = (null == DEBUG_summary)
		? "update to " + file_date.Split (':')[1].Trim() + " data;"
		: "DEBUG: " + DEBUG_summary;

	string header = "-- " + file_date;								// first line of every generated module
	string row_separator = ",\n\t";									// one table entry per line, indented one tab
	string table_body;

	switch (ArticleTitle)
	{
		case "Module:Lang/data/iana languages":						// two tables, so this one builds its own page text
			string active_rows = String.Join (row_separator, iana_languages.ToArray());
			string deprecated_rows = String.Join (row_separator, iana_languages_dep.ToArray());
			return header
				+ "\nlocal active = {\n\t" + active_rows + "\n\t}"
				+ "\n\nlocal deprecated = {\n\t" + deprecated_rows + "\n\t}"
				+ "\n\nreturn {\n\tactive = active,\n\tdeprecated = deprecated,\n\t}";

		case "Module:ISO 639 name/ISO 639-1":
			table_body = String.Join (row_separator, ISO_639_1.ToArray());
			break;

		case "Module:Lang/data/iana scripts":
			table_body = String.Join (row_separator, iana_scripts.ToArray());
			break;

		case "Module:Lang/data/iana regions":
			table_body = String.Join (row_separator, iana_regions.ToArray());
			break;

		case "Module:Lang/data/iana variants":
			table_body = String.Join (row_separator, iana_variants.ToArray());
			break;

		case "Module:Lang/data/iana suppressed scripts":
			List<string> rows = new List<string>();
			foreach (KeyValuePair<string, string> entry in iana_suppressed_scripts)
			{
				string[] quoted_subtags = entry.Value.Split (',');	// stored as one comma-separated string per script
				rows.Add ("[\"" + entry.Key + "\"] = " + "{" + prettify_suppressed_subtags (quoted_subtags) + "}");
			}
			table_body = String.Join (row_separator, rows.ToArray());
			break;

		default:													// a page that this script does not know how to build
			Summary = "Unexpected article: " + ArticleTitle;
			Skip = true;
			return ArticleText;
	}

	return header + "\nreturn {\n\t" + table_body + "\n\t}";		// add file date, open and close the Lua table
}
//---------------------------< P R E T T I F Y _ S U P P R E S S E D _ S U B T A G S >------------------------
//
// Lays out a script's suppressed subtags as rows of at most 11 so that long lists (Latn in particular) do not
// run off the right side of the screen.  Lists of 11 or fewer come back as a single comma-separated row;
// longer lists come back as several such rows joined by a comma, a newline, and four tabs.
//
private string prettify_suppressed_subtags (string[] subtags_array)
{
	const int per_row = 11;											// max number of subtags in a row of text
	int total = subtags_array.Length;

	if (total <= per_row)											// short enough for one row
		return String.Join (", ", subtags_array);

	List<string> rows = new List<string>();
	for (int first = 0; first < total; first += per_row)			// walk the array one row at a time
	{
		int remaining = total - first;
		int width = (remaining < per_row) ? remaining : per_row;	// the last row may be short
		rows.Add (String.Join (", ", subtags_array, first, width));
	}

	return String.Join (",\n\t\t\t\t", rows.ToArray());
}
//==========================<< S T A T I C I N I T I A L I Z A T I O N >>===================================
//
// Fetch the plain-text registry file from the IANA website (it was formerly read from a local drive), parse it apart much as is done by
// [[Module:Lang/data/iana languages/make]]. But, instead of one large file from which sections are copied
// and then pasted into the individual modules, create separate 'files' and then use awb to simply assign
// ArticleText = <new text for ArticleTitle>
// and then return. The script's xml file holds the list of modules to be updated.
// original snippets of this taken from [[Wikipedia:AutoWikiBrowser/Custom_Modules#Passing_text_to_external_program_for_processing]]
//
//---------------------------< S T A T I C D A T A >--------------------------------------------------------
//
// Class-level state.  The static constructor fills in the registry-derived members; ProcessArticle() reads them.
//
static string subtag_reg_file_url = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry";	// where the static constructor fetches the registry from
static string subtag_reg_file_name = "Z:\\Wikipedia\\language-subtag-registry_2020-07-17.txt";	// local copy of the registry; in the code visible here it is referenced only by the commented-out file-reading path in the static constructor
static string[] sub_tag_records_array; // subtag registry is read into a local string and then split into this static array
static string file_date;	// first '%%'-delimited chunk of the registry, trimmed; written into each module's leading '-- ' comment and used for the edit summary
static string DEBUG_summary = null;	// when not null, ProcessArticle() uses 'DEBUG: ' + this as the edit summary; never assigned in the code visible here
static string ERROR_summary = null;	// set by the static constructor when reading the registry throws; when not null, ProcessArticle() skips every page
static List<string> iana_languages = new List<string>();	// Lua table entries for language subtags that are not deprecated
static List<string> iana_languages_dep = new List<string>();	// Lua table entries for deprecated language subtags
static List<string> iana_scripts = new List<string>();	// Lua table entries for script subtags
static List<string> iana_regions = new List<string>();	// Lua table entries for region subtags
static List<string> iana_variants = new List<string>();	// Lua table entries for variant subtags (descriptions and prefixes)
static SortedList<string, string> iana_suppressed_scripts = new SortedList<string, string>();	// key: script from a Suppress-Script field; value: comma-separated list of double-quoted subtags
static List<string> ISO_639_1 = new List<string>();	// Lua table entries for non-deprecated language subtags that are two characters long
//---------------------------< S T A T I C   C O N S T R U C T O R >------------------------------------------
//
// static constructor for the CustomModule class
//
// Fetches the registry from subtag_reg_file_url, splits it on the '%%' record separator, keeps the first
// chunk as file_date, and hands every other record to the parser that matches the record's 'Type:' line.
//
// Every failure is reported through ERROR_summary, which makes ProcessArticle() skip all pages.  Nothing is
// allowed to propagate out of here because an exception that escapes a static constructor leaves the class
// unusable for the rest of the session.
//
static CustomModule()
{
	string sub_tag_registry_text = "";
	try
	{
		// this WebRequest code courtesy of en.wiki editor User:DavidBrooks
		System.Net.HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(subtag_reg_file_url);
		webRequest.UserAgent = "IANA subtag registry file update (https://en.wikipedia.org/wiki/User:Trappist_the_monk/IANA_subtag_registry_file_update)";

		// dispose the response, its stream, and the reader so that the connection is released promptly
		using (System.Net.WebResponse response = webRequest.GetResponse())
		using (System.IO.Stream str = response.GetResponseStream())
		using (System.IO.StreamReader reader = new System.IO.StreamReader(str))
			sub_tag_registry_text = reader.ReadToEnd();

		// alternate source for testing: read a local copy of the registry instead of fetching it
//		if (System.IO.File.Exists (subtag_reg_file_name))
//		{
//			using (System.IO.StreamReader reader = System.IO.File.OpenText (subtag_reg_file_name))
//				sub_tag_registry_text = reader.ReadToEnd();
//		}
//		else
//			ERROR_summary = "file not found: " + subtag_reg_file_name;
	}
	catch (Exception e)
	{
		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_url + " (" + e.Message + ")";
//		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_name;
	}

	sub_tag_records_array = sub_tag_registry_text.Split (new string[] {"%%"}, StringSplitOptions.None);
	sub_tag_registry_text = null;									// unset this because we no longer need it
	file_date = sub_tag_records_array[0].Trim();					// get the file date

	// ProcessArticle() takes the text after the first ':' of file_date for the edit summary; a payload without
	// one (an error page, say) is not a registry file, so flag it here instead of crashing there
	if ((null == ERROR_summary) && (0 > file_date.IndexOf (':')))
		ERROR_summary = "Unexpected content read from: " + subtag_reg_file_url;

	try
	{
		foreach (string record in sub_tag_records_array.Skip(1))	// skip file date
		{
			string[] record_elements = record.Trim().Split ('\n');	// trim the record, then split on the newlines between record elements
			switch (record_elements[0].Trim())
			{
				case "Type: language":
					lang_parse (record_elements);
					break;
				case "Type: script":
					script_parse (record_elements);
					break;
				case "Type: region":
					region_parse (record_elements);
					break;
				case "Type: variant":
					variant_parse (record_elements);
					break;
			}
		}
	}
	catch (Exception e)
	{
		if (null == ERROR_summary)									// keep the earlier, more fundamental, message if there is one
			ERROR_summary = "Exception occurred parsing: " + subtag_reg_file_url + " (" + e.Message + ")";
	}
}
//---------------------------< V A R I A N T _ P A R S E >----------------------------------------------------
//
// Read a variant record and add the important bits to the iana_variants list
//
//	record_elements:	the lines of one registry record
//
// The entry added has the form:
//	["<subtag>"] = {
//		["descriptions"] = {"<description>", ...},
//		["prefixes"] = {"<prefix>", ...},
//		}
// Deprecated variants, and records that have no Subtag field, add nothing.
//
// Each line is split at its FIRST colon only so that a value that itself contains a colon is kept whole.
// Lines that begin with a space are continuations of the previous field; they are folded into the most
// recent description only when the field being continued is a Description.
//
static void variant_parse (string[] record_elements)
{
	string variant = null;
	string previous_field = null;									// name of the field most recently seen; tells us what a continuation line continues
	List<string> descriptions = new List<string>();					// escaped but not yet quoted
	List<string> prefixes = new List<string>();						// already quoted

	foreach (string element in record_elements)
	{
		if ((0 < element.Length) && (' ' == element[0]))			// continuation of the previous line
		{
			if (String.Equals ("Description", previous_field) && (0 < descriptions.Count))
			{
				int last = descriptions.Count - 1;
				descriptions[last] = descriptions[last] + " " + element.Trim().Replace ("\"", "\\\"");
			}
			continue;
		}

		int colon = element.IndexOf (':');
		if (0 > colon)												// not a 'field: value' line
			continue;

		string field = element.Substring (0, colon);
		string value = element.Substring (colon + 1).Trim();
		previous_field = field;

		if (String.Equals ("Deprecated", field))					// deprecated variants are not listed
			return;
		if (String.Equals ("Comments", field))
			break;													// ignore comments until the end; presume that these happen after Description elements

		switch (field)
		{
			case "Subtag":
				variant = value;
				break;
			case "Description":										// a variant name; may continue on the next line
				descriptions.Add (value.Replace ("\"", "\\\""));	// escape double quotes for the Lua string
				break;
			case "Prefix":
				prefixes.Add ("\"" + value.ToLowerInvariant() + "\"");	// invariant so the result does not depend on the machine's locale
				break;
		}
	}

	if (null == variant)											// no Subtag field; nothing to key the entry on
		return;

	string description_list = "";
	if (0 < descriptions.Count)
		description_list = "\"" + String.Join ("\", \"", descriptions.ToArray()) + "\"";

	description_list = "[\"descriptions\"] = {" + description_list + "},";
	string prefix_list = "[\"prefixes\"] = {" + String.Join (", ", prefixes.ToArray()) + "},";
	iana_variants.Add ("[\"" + variant + "\"] = " + "{\n\t\t" + description_list + "\n\t\t" + prefix_list + "\n\t\t}");
}
//---------------------------< R E G I O N _ P A R S E >------------------------------------------------------
//
// Hands a region record to lang_script_region_parse() and, unless that function rejects the record by
// returning null, appends the resulting table entry to the iana_regions list.
//
static void region_parse (string[] record_elements)
{
	string ignored_subtag = null;									// required by lang_script_region_parse(); value not used here
	bool ignored_deprecated = false;								// required by lang_script_region_parse(); value not used here

	string region_entry = lang_script_region_parse (record_elements, ref ignored_subtag, ref ignored_deprecated);
	if (null == region_entry)
		return;

	iana_regions.Add (region_entry);
}
//---------------------------< S C R I P T _ P A R S E >------------------------------------------------------
//
// Hands a script record to lang_script_region_parse() and, unless that function rejects the record by
// returning null, appends the resulting table entry to the iana_scripts list.
//
static void script_parse (string[] record_elements)
{
	string ignored_subtag = null;									// required by lang_script_region_parse(); value not used here
	bool ignored_deprecated = false;								// required by lang_script_region_parse(); value not used here

	string script_entry = lang_script_region_parse (record_elements, ref ignored_subtag, ref ignored_deprecated);
	if (null == script_entry)
		return;

	iana_scripts.Add (script_entry);
}
//---------------------------< L A N G _ P A R S E >----------------------------------------------------------
//
// Files a language record's table entry in the right list(s):
//	deprecated subtags go to iana_languages_dep only;
//	all other subtags go to iana_languages, and those that are two characters long also go to ISO_639_1.
// Records that lang_script_region_parse() rejects (null result) are dropped.
//
static void lang_parse (string[] record_elements)
{
	string code = null;
	bool is_deprecated = false;

	string entry = lang_script_region_parse (record_elements, ref code, ref is_deprecated);
	if (null == entry)
		return;

	if (is_deprecated)
	{
		iana_languages_dep.Add (entry);								// deprecated subtags of any length
		return;
	}

	if (2 == code.Length)
		ISO_639_1.Add (entry);										// two-character subtags only
	iana_languages.Add (entry);										// every non-deprecated subtag
}
//---------------------------< L A N G _ S C R I P T _ R E G I O N _ P A R S E >------------------------------
//
// common function for extracting the important bits from language, script, and region records
//
//	record_elements:	the lines of one registry record; the first line is the record's 'Type:' line
//	subtag:			receives the value of the record's Subtag field
//	deprecated:		set true when a language record has a Deprecated field
//
// Returns a Lua table entry of the form ["<subtag>"] = {"<name>", ...}, or null when the record is to be
// ignored: 'Private use' records, deprecated script and region records, and records with no Subtag field.
// A Suppress-Script field is passed on to suppress_script_add() as a side effect.
//
// Each line is split at its FIRST colon only so that a value that itself contains a colon is kept whole.
// Lines that begin with a space are continuations of the previous field; they are folded into the most
// recent name only when the field being continued is a Description.
//
static string lang_script_region_parse (string[] record_elements, ref string subtag, ref bool deprecated)
{
	string names = "";
	string previous_field = null;									// name of the field most recently seen; tells us what a continuation line continues
	bool is_language = String.Equals ("Type: language", record_elements[0].Trim());	// trimmed, as the static constructor does, so a stray '\r' cannot defeat the match

	foreach (string element in record_elements)
	{
		if ((0 < element.Length) && (' ' == element[0]))			// continuation of the previous line
		{
			if (String.Equals ("Description", previous_field) && (0 < names.Length))
			{
				names = names.Substring (0, names.Length-1);		// remove trailing double quote character
				names = names + " " + element.Trim().Replace ("\"", "\\\"") + "\"";	// add on the continuation and a new double quote character
			}
			continue;
		}

		int colon = element.IndexOf (':');
		if (0 > colon)												// not a 'field: value' line
			continue;

		string field = element.Substring (0, colon);
		string value = element.Substring (colon + 1).Trim();
		previous_field = field;

		if (String.Equals ("Comments", field))
			break;													// ignore comments until the end; presume that these happen after Description elements

		switch (field)
		{
			case "Subtag":
				subtag = value;
				break;

			case "Description":										// a name; may continue on the next line
				if (String.Equals ("Private use", value))			// private use subtags not supported
					return null;
				string quoted_name = "\"" + value.Replace ("\"", "\\\"") + "\"";	// escape double quotes for the Lua string, as variant_parse() does
				names = (0 == names.Length) ? quoted_name : names + ", " + quoted_name;
				break;

			case "Suppress-Script":
				suppress_script_add (value, subtag);				// add this subtag to this script's list
				break;

			case "Deprecated":
				if (!is_language)									// ignore deprecated script and region tags
					return null;
				deprecated = true;									// report to calling function that this subtag is deprecated
				break;
		}
	}

	if (null == subtag)												// no Subtag field; nothing to key the entry on
		return null;

	return "[\"" + subtag.Trim() + "\"] = " + "{" + names + "}";
}
//---------------------------< S U P P R E S S _ S C R I P T _ A D D >----------------------------------------
//
// Records that <subtag> suppresses <script>.  iana_suppressed_scripts maps each script to one string that
// holds its double-quoted subtags separated by commas: the first subtag for a script creates the entry and
// every later one is appended to it.
//
static void suppress_script_add (string script, string subtag)
{
	string quoted_subtag = "\"" + subtag + "\"";

	if (!iana_suppressed_scripts.ContainsKey (script))
	{
		iana_suppressed_scripts.Add (script, quoted_subtag);		// start a new list for this script
		return;
	}

	iana_suppressed_scripts[script] = iana_suppressed_scripts[script] + "," + quoted_subtag;	// extend the existing list
}
// Z:\Wikipedia\AWB\IANA_subtag_registry_file_update.cs</Code>
</Module>
<ExternalProgram>
<Enabled>false</Enabled>
<Skip>false</Skip>
<Program />
<Parameters />
<PassAsFile>true</PassAsFile>
<OutputFile />
</ExternalProgram>
<Disambiguation>
<Enabled>false</Enabled>
<Link />
<Variants />
<ContextChars>20</ContextChars>
</Disambiguation>
<Special>
<namespaceValues />
<remDupes>true</remDupes>
<sortAZ>true</sortAZ>
<filterTitlesThatContain>false</filterTitlesThatContain>
<filterTitlesThatContainText />
<filterTitlesThatDontContain>false</filterTitlesThatDontContain>
<filterTitlesThatDontContainText />
<areRegex>false</areRegex>
<opType>0</opType>
<remove />
</Special>
<Tool>
<ListComparerUseCurrentArticleList>0</ListComparerUseCurrentArticleList>
<ListSplitterUseCurrentArticleList>0</ListSplitterUseCurrentArticleList>
<DatabaseScannerUseCurrentArticleList>0</DatabaseScannerUseCurrentArticleList>
</Tool>
<Plugin>
<PluginPrefs>
<Name>CSV Loader</Name>
<PluginSettings>
<anyType xsi:type="PrefsKeyPair">
<Name>TextMode</Name>
<Setting xsi:type="xsd:string">Append</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>InputText</Name>
<Setting xsi:type="xsd:string" />
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>ColumnHeaders</Name>
<Setting xsi:type="xsd:string" />
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>Skip</Name>
<Setting xsi:type="xsd:boolean">true</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>Separator</Name>
<Setting xsi:type="xsd:string">,</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>CreateLists</Name>
<Setting xsi:type="xsd:boolean">false</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>ListSeparator</Name>
<Setting xsi:type="xsd:string">^</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>FindReplace</Name>
<Setting xsi:type="xsd:boolean">false</Setting>
</anyType>
<anyType xsi:type="PrefsKeyPair">
<Name>EditSummary</Name>
<Setting xsi:type="xsd:string" />
</anyType>
</PluginSettings>
</PluginPrefs>
</Plugin>
</AutoWikiBrowserPreferences>
c# module
// this script is intended to simplify updates from:
// IANA [https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry language-subtag-registry file]
// to:
// [[Module:Lang/data/iana languages]]
// [[Module:Lang/data/iana scripts]]
// [[Module:Lang/data/iana regions]]
// [[Module:Lang/data/iana variants]]
// [[Module:Lang/data/iana suppressed scripts]]
// [[Module:ISO 639 name/ISO 639-1]]
//
// updated 2024-07-12 to account for shift of data files from Module:Language/data/~ to Module:Lang/data/~
//---------------------------< P R O C E S S   A R T I C L E >------------------------------------------------
//
// Entry point that AWB invokes with a page's title and text.  The incoming text is not edited; it is replaced
// wholesale by a Lua data table generated from the lists that the static constructor filled in.
//
//	ArticleText:	current page text; handed back untouched whenever the page is skipped
//	ArticleTitle:	selects which data module is generated
//	wikiNamespace:	not used
//	Summary:	edit summary: an 'ERROR: ' or 'DEBUG: ' message, 'Unexpected article: ...', or the registry date
//	Skip:		true when ERROR_summary is set or when ArticleTitle is not one of the known data modules
//
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
	Skip = false;

	if (null != ERROR_summary)										// the registry could not be read; touch nothing
	{
		Summary = "ERROR: " + ERROR_summary;
		Skip = true;
		return ArticleText;
	}

	Summary = (null == DEBUG_summary)
		? "update to " + file_date.Split (':')[1].Trim() + " data;"
		: "DEBUG: " + DEBUG_summary;

	string header = "-- " + file_date;								// first line of every generated module
	string row_separator = ",\n\t";									// one table entry per line, indented one tab
	string table_body;

	switch (ArticleTitle)
	{
		case "Module:Lang/data/iana languages":						// two tables, so this one builds its own page text
			string active_rows = String.Join (row_separator, iana_languages.ToArray());
			string deprecated_rows = String.Join (row_separator, iana_languages_dep.ToArray());
			return header
				+ "\nlocal active = {\n\t" + active_rows + "\n\t}"
				+ "\n\nlocal deprecated = {\n\t" + deprecated_rows + "\n\t}"
				+ "\n\nreturn {\n\tactive = active,\n\tdeprecated = deprecated,\n\t}";

		case "Module:ISO 639 name/ISO 639-1":
			table_body = String.Join (row_separator, ISO_639_1.ToArray());
			break;

		case "Module:Lang/data/iana scripts":
			table_body = String.Join (row_separator, iana_scripts.ToArray());
			break;

		case "Module:Lang/data/iana regions":
			table_body = String.Join (row_separator, iana_regions.ToArray());
			break;

		case "Module:Lang/data/iana variants":
			table_body = String.Join (row_separator, iana_variants.ToArray());
			break;

		case "Module:Lang/data/iana suppressed scripts":
			List<string> rows = new List<string>();
			foreach (KeyValuePair<string, string> entry in iana_suppressed_scripts)
			{
				string[] quoted_subtags = entry.Value.Split (',');	// stored as one comma-separated string per script
				rows.Add ("[\"" + entry.Key + "\"] = " + "{" + prettify_suppressed_subtags (quoted_subtags) + "}");
			}
			table_body = String.Join (row_separator, rows.ToArray());
			break;

		default:													// a page that this script does not know how to build
			Summary = "Unexpected article: " + ArticleTitle;
			Skip = true;
			return ArticleText;
	}

	return header + "\nreturn {\n\t" + table_body + "\n\t}";		// add file date, open and close the Lua table
}
//---------------------------< P R E T T I F Y _ S U P P R E S S E D _ S U B T A G S >------------------------
//
// Lays out a script's suppressed subtags as rows of at most 11 so that long lists (Latn in particular) do not
// run off the right side of the screen.  Lists of 11 or fewer come back as a single comma-separated row;
// longer lists come back as several such rows joined by a comma, a newline, and four tabs.
//
private string prettify_suppressed_subtags (string[] subtags_array)
{
	const int per_row = 11;											// max number of subtags in a row of text
	int total = subtags_array.Length;

	if (total <= per_row)											// short enough for one row
		return String.Join (", ", subtags_array);

	List<string> rows = new List<string>();
	for (int first = 0; first < total; first += per_row)			// walk the array one row at a time
	{
		int remaining = total - first;
		int width = (remaining < per_row) ? remaining : per_row;	// the last row may be short
		rows.Add (String.Join (", ", subtags_array, first, width));
	}

	return String.Join (",\n\t\t\t\t", rows.ToArray());
}
//==========================<< S T A T I C I N I T I A L I Z A T I O N >>===================================
//
// Fetch the plain-text registry file from the IANA website (it was formerly read from a local drive), parse it apart much as is done by
// [[Module:Lang/data/iana languages/make]]. But, instead of one large file from which sections are copied
// and then pasted into the individual modules, create separate 'files' and then use awb to simply assign
// ArticleText = <new text for ArticleTitle>
// and then return. The script's xml file holds the list of modules to be updated.
// original snippets of this taken from [[Wikipedia:AutoWikiBrowser/Custom_Modules#Passing_text_to_external_program_for_processing]]
//
//---------------------------< S T A T I C D A T A >--------------------------------------------------------
//
// Class-level state.  The static constructor fills in the registry-derived members; ProcessArticle() reads them.
//
static string subtag_reg_file_url = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry";	// where the static constructor fetches the registry from
static string subtag_reg_file_name = "Z:\\Wikipedia\\language-subtag-registry_2020-07-17.txt";	// local copy of the registry; in the code visible here it is referenced only by the commented-out file-reading path in the static constructor
static string[] sub_tag_records_array; // subtag registry is read into a local string and then split into this static array
static string file_date;	// first '%%'-delimited chunk of the registry, trimmed; written into each module's leading '-- ' comment and used for the edit summary
static string DEBUG_summary = null;	// when not null, ProcessArticle() uses 'DEBUG: ' + this as the edit summary; never assigned in the code visible here
static string ERROR_summary = null;	// set by the static constructor when reading the registry throws; when not null, ProcessArticle() skips every page
static List<string> iana_languages = new List<string>();	// Lua table entries for language subtags that are not deprecated
static List<string> iana_languages_dep = new List<string>();	// Lua table entries for deprecated language subtags
static List<string> iana_scripts = new List<string>();	// Lua table entries for script subtags
static List<string> iana_regions = new List<string>();	// Lua table entries for region subtags
static List<string> iana_variants = new List<string>();	// Lua table entries for variant subtags (descriptions and prefixes)
static SortedList<string, string> iana_suppressed_scripts = new SortedList<string, string>();	// key: script from a Suppress-Script field; value: comma-separated list of double-quoted subtags
static List<string> ISO_639_1 = new List<string>();	// Lua table entries for non-deprecated language subtags that are two characters long
//---------------------------< S T A T I C   C O N S T R U C T O R >------------------------------------------
//
// static constructor for the CustomModule class
//
// Fetches the registry from subtag_reg_file_url, splits it on the '%%' record separator, keeps the first
// chunk as file_date, and hands every other record to the parser that matches the record's 'Type:' line.
//
// Every failure is reported through ERROR_summary, which makes ProcessArticle() skip all pages.  Nothing is
// allowed to propagate out of here because an exception that escapes a static constructor leaves the class
// unusable for the rest of the session.
//
static CustomModule()
{
	string sub_tag_registry_text = "";
	try
	{
		// this WebRequest code courtesy of en.wiki editor User:DavidBrooks
		System.Net.HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(subtag_reg_file_url);
		webRequest.UserAgent = "IANA subtag registry file update (https://en.wikipedia.org/wiki/User:Trappist_the_monk/IANA_subtag_registry_file_update)";

		// dispose the response, its stream, and the reader so that the connection is released promptly
		using (System.Net.WebResponse response = webRequest.GetResponse())
		using (System.IO.Stream str = response.GetResponseStream())
		using (System.IO.StreamReader reader = new System.IO.StreamReader(str))
			sub_tag_registry_text = reader.ReadToEnd();

		// alternate source for testing: read a local copy of the registry instead of fetching it
//		if (System.IO.File.Exists (subtag_reg_file_name))
//		{
//			using (System.IO.StreamReader reader = System.IO.File.OpenText (subtag_reg_file_name))
//				sub_tag_registry_text = reader.ReadToEnd();
//		}
//		else
//			ERROR_summary = "file not found: " + subtag_reg_file_name;
	}
	catch (Exception e)
	{
		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_url + " (" + e.Message + ")";
//		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_name;
	}

	sub_tag_records_array = sub_tag_registry_text.Split (new string[] {"%%"}, StringSplitOptions.None);
	sub_tag_registry_text = null;									// unset this because we no longer need it
	file_date = sub_tag_records_array[0].Trim();					// get the file date

	// ProcessArticle() takes the text after the first ':' of file_date for the edit summary; a payload without
	// one (an error page, say) is not a registry file, so flag it here instead of crashing there
	if ((null == ERROR_summary) && (0 > file_date.IndexOf (':')))
		ERROR_summary = "Unexpected content read from: " + subtag_reg_file_url;

	try
	{
		foreach (string record in sub_tag_records_array.Skip(1))	// skip file date
		{
			string[] record_elements = record.Trim().Split ('\n');	// trim the record, then split on the newlines between record elements
			switch (record_elements[0].Trim())
			{
				case "Type: language":
					lang_parse (record_elements);
					break;
				case "Type: script":
					script_parse (record_elements);
					break;
				case "Type: region":
					region_parse (record_elements);
					break;
				case "Type: variant":
					variant_parse (record_elements);
					break;
			}
		}
	}
	catch (Exception e)
	{
		if (null == ERROR_summary)									// keep the earlier, more fundamental, message if there is one
			ERROR_summary = "Exception occurred parsing: " + subtag_reg_file_url + " (" + e.Message + ")";
	}
}
//---------------------------< V A R I A N T _ P A R S E >----------------------------------------------------
//
// Read a variant record and add the important bits to the iana_variants list
//
// record_elements: the newline-split lines of one registry record (first line is "Type: variant").
//
// Collects the Subtag, every Description (with double quotes backslash-escaped), and every Prefix (lowercased),
// then appends one formatted entry to iana_variants.  Nothing is added when the record has a Deprecated
// field or has no Subtag field.  Parsing stops at the first Comments field.
//
// Each line is split at its FIRST colon only so that values which themselves contain a colon are kept whole.
// A continuation line (two leading spaces) is appended to the last description only when the field that
// immediately precedes it is a Description; continuations of any other field are ignored.
static void variant_parse (string[] record_elements)
{
	string variant = null;
	string description_list = null;
	bool in_description = false;								// true while the most recently seen field is a Description
	List<string> prefixes = new List<string>();

	foreach (string element in record_elements)
	{
		if (element.Length >= 2 && ' ' == element[0] && ' ' == element[1])	// line begins with two spaces: continuation of the previous line
		{
			if (in_description && null != description_list)
			{
				string continuation = element.Substring (2).Trim().Replace ("\"", "\\\"");	// escape quotes the same way Description values are escaped
				description_list = description_list.Substring (0, description_list.Length-1);	// remove trailing double quote character
				description_list = description_list + " " + continuation + "\"";			// add on the continuation and a new double quote character
			}
			continue;
		}

		in_description = false;									// a new field starts here

		int colon = element.IndexOf (':');
		if (colon < 0)
			continue;											// not a "Field: value" line; nothing to extract

		string field = element.Substring (0, colon);
		string value = element.Substring (colon + 1).Trim();

		if (String.Equals ("Deprecated", field))
			return;												// deprecated variants are not listed
		if (String.Equals ("Comments", field))
			break;												// ignore comments until the end; presume that these happen after Description elements

		if (String.Equals ("Subtag", field))
			variant = value;
		else if (String.Equals ("Description", field))			// a variant name; may continue on the next line
		{
			string description = value.Replace ("\"", "\\\"");
			if (null == description_list)						// done this way because descriptions can continue on the next line
				description_list = "\"" + description + "\"";
			else
				description_list = description_list + ", \"" + description + "\"";
			in_description = true;
		}
		else if (String.Equals ("Prefix", field))
			prefixes.Add ("\"" + value.ToLowerInvariant() + "\"");	// invariant casing: prefixes are identifiers, not display text
	}

	if (null == variant)
		return;													// malformed record without a Subtag; nothing to key the entry on

	description_list = "[\"descriptions\"] = {" + description_list + "},";
	string prefix_list = "[\"prefixes\"] = {" + String.Join (", ", prefixes.ToArray()) + "},";

	iana_variants.Add ("[\"" + variant + "\"] = " + "{\n\t\t" + description_list + "\n\t\t" + prefix_list + "\n\t\t}");
}
//---------------------------< R E G I O N _ P A R S E >------------------------------------------------------
//
// Read a region record and add the important bits to the iana_regions list
//
// record_elements: the newline-split lines of one registry record (first line is "Type: region").
//
// Delegates the extraction to lang_script_region_parse() and stores its result in iana_regions; a null
// result means the record is to be skipped.
static void region_parse (string[] record_elements)
{
	string unused_subtag = null;								// placeholder for a ref parameter of lang_script_region_parse(); not read here
	bool unused_deprecated = false;								// placeholder for a ref parameter of lang_script_region_parse(); not read here

	string entry = lang_script_region_parse (record_elements, ref unused_subtag, ref unused_deprecated);
	if (null == entry)
		return;

	iana_regions.Add (entry);
}
//---------------------------< S C R I P T _ P A R S E >------------------------------------------------------
//
// Read a script record and add the important bits to the iana_scripts list
//
// record_elements: the newline-split lines of one registry record (first line is "Type: script").
//
// Delegates the extraction to lang_script_region_parse() and stores its result in iana_scripts; a null
// result means the record is to be skipped.
static void script_parse (string[] record_elements)
{
	string unused_subtag = null;								// placeholder for a ref parameter of lang_script_region_parse(); not read here
	bool unused_deprecated = false;								// placeholder for a ref parameter of lang_script_region_parse(); not read here

	string entry = lang_script_region_parse (record_elements, ref unused_subtag, ref unused_deprecated);
	if (null == entry)
		return;

	iana_scripts.Add (entry);
}
//---------------------------< L A N G _ P A R S E >----------------------------------------------------------
//
// Read a language record and add the important bits to the ISO_639_1 list (2-character subtags) and / or the
// iana_languages list (both 2- and 3-character subtags)
//
// record_elements: the newline-split lines of one registry record (first line is "Type: language").
//
// Delegates the extraction to lang_script_region_parse(), then files the result:
//	null result			-> record skipped
//	deprecated			-> iana_languages_dep only
//	otherwise			-> iana_languages, and also ISO_639_1 when the subtag is two characters long
static void lang_parse (string[] record_elements)
{
	string code = null;											// receives the record's subtag
	bool is_deprecated = false;									// receives the record's deprecation state

	string entry = lang_script_region_parse (record_elements, ref code, ref is_deprecated);
	if (null == entry)
		return;

	if (is_deprecated)
	{
		iana_languages_dep.Add (entry);							// gets 2- and 3-character deprecated subtags
		return;
	}

	if (2 == code.Length)
		ISO_639_1.Add (entry);									// gets only 2-character subtags
	iana_languages.Add (entry);									// gets 2- and 3-character subtags
}
//---------------------------< L A N G _ S C R I P T _ R E G I O N _ P A R S E >------------------------------
//
// common function for extracting the important bits from language, script, and region records
//
// record_elements:	the newline-split lines of one registry record; the first line names the record type
// subtag:			set to the value of the record's Subtag field
// deprecated:		set to true when a language record has a Deprecated field
//
// Returns a string of the form ["<subtag>"] = {"<name>", "<name>", ...}, or null when the record is to be
// skipped: a Description of "Private use", a deprecated script or region record, or a record without a
// Subtag field.  Every Suppress-Script field is passed to suppress_script_add().  Parsing stops at the first
// Comments field.
//
// Each line is split at its FIRST colon only so that values which themselves contain a colon are kept whole.
// A continuation line (two leading spaces) is appended to the last name only when the field that
// immediately precedes it is a Description; continuations of any other field are ignored.  Double quotes in
// names are backslash-escaped, matching what variant_parse() does for variant descriptions.
static string lang_script_region_parse (string[]record_elements, ref string subtag, ref bool deprecated)
{
	string names = "";
	bool in_description = false;								// true while the most recently seen field is a Description
	bool is_language = record_elements.Length > 0
		&& String.Equals ("Type: language", record_elements[0].Trim());	// trimmed so a stray trailing '\r' does not defeat the comparison

	foreach (string element in record_elements)
	{
		if (element.Length >= 2 && ' ' == element[0] && ' ' == element[1])	// line begins with two spaces: continuation of the previous line
		{
			if (in_description && 0 != names.Length)
			{
				string continuation = element.Substring (2).Trim().Replace ("\"", "\\\"");
				names = names.Substring (0, names.Length-1);	// remove trailing double quote character
				names = names + " " + continuation + "\"";		// add on the continuation and a new double quote character
			}
			continue;
		}

		in_description = false;									// a new field starts here

		int colon = element.IndexOf (':');
		if (colon < 0)
			continue;											// not a "Field: value" line; nothing to extract

		string field = element.Substring (0, colon);
		string value = element.Substring (colon + 1).Trim();

		if (String.Equals ("Comments", field))
			break;												// ignore comments until the end; presume that these happen after Description elements

		if (String.Equals ("Subtag", field))
			subtag = value;
		else if (String.Equals ("Description", field))			// a language name; may continue on the next line
		{
			if (String.Equals ("Private use", value))			// private use subtags not supported
				return null;

			string name = value.Replace ("\"", "\\\"");
			if (0 == names.Length)								// done this way because descriptions can continue on the next line
				names = "\"" + name + "\"";
			else
				names = names + ", \"" + name + "\"";
			in_description = true;
		}
		else if (String.Equals ("Suppress-Script", field))
			suppress_script_add (value, subtag);				// add this subtag to this script's list
		else if (String.Equals ("Deprecated", field))
		{
			if (!is_language)									// ignore deprecated script and region tags
				return null;
			deprecated = true;									// report to calling function that this subtag is deprecated
		}
	}

	if (null == subtag)
		return null;											// malformed record without a Subtag; callers skip null results

	return "[\"" + subtag.Trim() + "\"] = " + "{" + names + "}";
}
//---------------------------< S U P P R E S S _ S C R I P T _ A D D >----------------------------------------
//
// manages the addition of a new script and its suppressed subtag and manages the addition of a new subtag to
// a script's existing list of subtags
//
// script:	key into iana_suppressed_scripts
// subtag:	subtag to record under that key; stored wrapped in double quotes
//
// The value held for each script is a comma-separated string of quoted subtags (no space after the comma).
static void suppress_script_add (string script, string subtag)
{
	string quoted_subtag = "\"" + subtag + "\"";

	if (!iana_suppressed_scripts.ContainsKey (script))
	{
		iana_suppressed_scripts.Add (script, quoted_subtag);	// first subtag for this script: start a new list
		return;
	}

	iana_suppressed_scripts[script] = iana_suppressed_scripts[script] + "," + quoted_subtag;	// extend the existing list
}
// Z:\Wikipedia\AWB\IANA_subtag_registry_file_update.cs