Jump to content

User:Trappist the monk/IANA subtag registry file update

From Wikipedia, the free encyclopedia

Module:Lang relies upon a series of Lua data modules that are transcribed from the IANA language-subtag-registry file:

All of these modules can be updated by using Module:Lang/data/iana_languages/make and copying and pasting the appropriate sections of that module's output into the related data module. This page provides a crude AWB (AutoWikiBrowser) script that can be used to update all of the data modules in a more 'automatic' manner. The script fetches the language-subtag-registry file from the IANA website, parses it apart, and updates the various modules.

The process is:

  1. Open AWB and load the settings file.
  2. Click Start.
  3. Review the changes and, if acceptable, click Save; repeat until done.

settings file

[edit]
<?xml version="1.0" encoding="utf-8"?>
<AutoWikiBrowserPreferences xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xml:space="preserve" Version="6.3.0.0">
  <Project>wikipedia</Project>
  <LanguageCode>en</LanguageCode>
  <CustomProject />
  <Protocol>http://</Protocol>
  <LoginDomain />
  <List>
    <ListSource />
    <SelectedProvider>CategoryListProvider</SelectedProvider>
    <ArticleList>
      <Article NameSpaceKey="828">
        <PreProcessed>false</PreProcessed>
        <Name>Module:Lang/data/iana languages</Name>
      </Article>
      <Article NameSpaceKey="828">
        <PreProcessed>false</PreProcessed>
        <Name>Module:Lang/data/iana scripts</Name>
      </Article>
      <Article NameSpaceKey="828">
        <PreProcessed>false</PreProcessed>
        <Name>Module:Lang/data/iana regions</Name>
      </Article>
      <Article NameSpaceKey="828">
        <PreProcessed>false</PreProcessed>
        <Name>Module:Lang/data/iana variants</Name>
      </Article>
      <Article NameSpaceKey="828">
        <PreProcessed>false</PreProcessed>
        <Name>Module:Lang/data/iana suppressed scripts</Name>
      </Article>
      <Article NameSpaceKey="828">
        <PreProcessed>false</PreProcessed>
        <Name>Module:ISO 639 name/ISO 639-1</Name>
      </Article>
    </ArticleList>
  </List>
  <FindAndReplace>
    <Enabled>false</Enabled>
    <IgnoreSomeText>false</IgnoreSomeText>
    <IgnoreMoreText>false</IgnoreMoreText>
    <AppendSummary>false</AppendSummary>
    <Replacements />
    <AdvancedReps />
    <SubstTemplates />
    <IncludeComments>false</IncludeComments>
    <ExpandRecursively>true</ExpandRecursively>
    <IgnoreUnformatted>false</IgnoreUnformatted>
  </FindAndReplace>
  <Editprefs>
    <GeneralFixes>false</GeneralFixes>
    <Tagger>false</Tagger>
    <Unicodify>false</Unicodify>
    <Recategorisation>0</Recategorisation>
    <NewCategory />
    <NewCategory2 />
    <ReImage>0</ReImage>
    <ImageFind />
    <Replace />
    <SkipIfNoCatChange>false</SkipIfNoCatChange>
    <RemoveSortKey>false</RemoveSortKey>
    <SkipIfNoImgChange>false</SkipIfNoImgChange>
    <AppendText>false</AppendText>
    <AppendTextMetaDataSort>false</AppendTextMetaDataSort>
    <Append>false</Append>
    <Text />
    <Newlines>0</Newlines>
    <AutoDelay>0</AutoDelay>
    <BotMaxEdits>0</BotMaxEdits>
    <SupressTag>false</SupressTag>
    <RegexTypoFix>false</RegexTypoFix>
  </Editprefs>
  <General>
    <AutoSaveEdit>
      <Enabled>false</Enabled>
      <SavePeriod>30</SavePeriod>
      <SaveFile />
    </AutoSaveEdit>
    <SelectedSummary />
    <Summaries>
      <string>clean up</string>
      <string>re-categorisation per [[WP:CFD|CFD]]</string>
      <string>clean up and re-categorisation per [[WP:CFD|CFD]]</string>
      <string>removing category per [[WP:CFD|CFD]]</string>
      <string>[[Wikipedia:Template substitution|subst:'ing]]</string>
      <string>[[Wikipedia:WikiProject Stub sorting|stub sorting]]</string>
      <string>[[WP:AWB/T|Typo fixing]]</string>
      <string>bad link repair</string>
      <string>Fixing [[Wikipedia:Disambiguation pages with links|links to disambiguation pages]]</string>
      <string>Unicodifying</string>
      <string>use [[Module:Lang]]; see [[Template talk:Lang#converting to lua]];</string>
    </Summaries>
    <PasteMore>
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
      <string />
    </PasteMore>
    <FindText />
    <FindRegex>false</FindRegex>
    <FindCaseSensitive>false</FindCaseSensitive>
    <WordWrap>true</WordWrap>
    <ToolBarEnabled>false</ToolBarEnabled>
    <BypassRedirect>true</BypassRedirect>
    <AutoSaveSettings>false</AutoSaveSettings>
    <noSectionEditSummary>true</noSectionEditSummary>
    <restrictDefaultsortAddition>true</restrictDefaultsortAddition>
    <restrictOrphanTagging>true</restrictOrphanTagging>
    <noMOSComplianceFixes>false</noMOSComplianceFixes>
    <syntaxHighlightEditBox>false</syntaxHighlightEditBox>
    <highlightAllFind>false</highlightAllFind>
    <PreParseMode>false</PreParseMode>
    <NoAutoChanges>false</NoAutoChanges>
    <OnLoadAction>0</OnLoadAction>
    <DiffInBotMode>false</DiffInBotMode>
    <Minor>true</Minor>
    <AddToWatchlist>2</AddToWatchlist>
    <TimerEnabled>false</TimerEnabled>
    <SortListAlphabetically>false</SortListAlphabetically>
    <AddIgnoredToLog>false</AddIgnoredToLog>
    <EditToolbarEnabled>false</EditToolbarEnabled>
    <filterNonMainSpace>false</filterNonMainSpace>
    <AutoFilterDuplicates>false</AutoFilterDuplicates>
    <FocusAtEndOfEditBox>false</FocusAtEndOfEditBox>
    <scrollToUnbalancedBrackets>false</scrollToUnbalancedBrackets>
    <TextBoxSize>10</TextBoxSize>
    <TextBoxFont>Courier New</TextBoxFont>
    <LowThreadPriority>false</LowThreadPriority>
    <Beep>false</Beep>
    <Flash>false</Flash>
    <Minimize>false</Minimize>
    <LockSummary>false</LockSummary>
    <SaveArticleList>true</SaveArticleList>
    <SuppressUsingAWB>false</SuppressUsingAWB>
    <AddUsingAWBToActionSummaries>false</AddUsingAWBToActionSummaries>
    <IgnoreNoBots>false</IgnoreNoBots>
    <ClearPageListOnProjectChange>false</ClearPageListOnProjectChange>
    <SortInterWikiOrder>true</SortInterWikiOrder>
    <ReplaceReferenceTags>true</ReplaceReferenceTags>
    <LoggingEnabled>true</LoggingEnabled>
    <AlertPreferences />
  </General>
  <SkipOptions>
    <SkipNonexistent>true</SkipNonexistent>
    <Skipexistent>false</Skipexistent>
    <SkipDontCare>false</SkipDontCare>
    <SkipWhenNoChanges>false</SkipWhenNoChanges>
    <SkipSpamFilterBlocked>false</SkipSpamFilterBlocked>
    <SkipInuse>false</SkipInuse>
    <SkipWhenOnlyWhitespaceChanged>false</SkipWhenOnlyWhitespaceChanged>
    <SkipOnlyGeneralFixChanges>true</SkipOnlyGeneralFixChanges>
    <SkipOnlyMinorGeneralFixChanges>false</SkipOnlyMinorGeneralFixChanges>
    <SkipOnlyCosmetic>false</SkipOnlyCosmetic>
    <SkipOnlyCasingChanged>false</SkipOnlyCasingChanged>
    <SkipIfRedirect>false</SkipIfRedirect>
    <SkipIfNoAlerts>false</SkipIfNoAlerts>
    <SkipDoes>false</SkipDoes>
    <SkipDoesText />
    <SkipDoesRegex>false</SkipDoesRegex>
    <SkipDoesCaseSensitive>false</SkipDoesCaseSensitive>
    <SkipDoesAfterProcessing>false</SkipDoesAfterProcessing>
    <SkipDoesNot>false</SkipDoesNot>
    <SkipDoesNotText />
    <SkipDoesNotRegex>false</SkipDoesNotRegex>
    <SkipDoesNotCaseSensitive>false</SkipDoesNotCaseSensitive>
    <SkipDoesNotAfterProcessing>false</SkipDoesNotAfterProcessing>
    <SkipNoFindAndReplace>false</SkipNoFindAndReplace>
    <SkipMinorFindAndReplace>false</SkipMinorFindAndReplace>
    <SkipNoRegexTypoFix>false</SkipNoRegexTypoFix>
    <SkipNoDisambiguation>false</SkipNoDisambiguation>
    <SkipNoLinksOnPage>false</SkipNoLinksOnPage>
    <GeneralSkipList />
  </SkipOptions>
  <Module>
    <Enabled>true</Enabled>
    <Language>C# 3.5</Language>
    <Code>// this script is intended to simplify updates from:
//		IANA [https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry language-subtag-registry file]
// to:
//		[[Module:Lang/data/iana languages]]
//		[[Module:Lang/data/iana scripts]]
//		[[Module:Lang/data/iana regions]]
//		[[Module:Lang/data/iana variants]]
//		[[Module:Lang/data/iana suppressed scripts]]
//		[[Module:ISO 639 name/ISO 639-1]]
//
// updated 2024-07-12 to account for shift of data files from Module:Language/data/~ to Module:Lang/data/~

//---------------------------&lt; P R O C E S S A R T I C L E &gt;--------------------------------------------------
//
//
//

// Build the complete replacement Lua source for one of the data modules listed in the settings file.
//		ArticleText   - current text of the page; handed back untouched whenever the page is skipped
//		ArticleTitle  - full page name; selects which of the parsed registry lists is rendered
//		wikiNamespace - not used
//		Summary       - receives the edit summary, or an "ERROR: ..." / "DEBUG: ..." / "Unexpected article: ..." message
//		Skip          - set true when the page must not be saved
// Returns the new module text, or ArticleText when the page is skipped.
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
	{
	string replacement = null;
	Skip = false;

	if (null != ERROR_summary)											// static constructor could not get the registry; never save
		{
		Summary = "ERROR: " + ERROR_summary;
		Skip = true;
		return ArticleText;
		}

	if (null != DEBUG_summary)
		Summary = "DEBUG: " + DEBUG_summary;
	else
		{
		if ((null == file_date) || (-1 == file_date.IndexOf (':')))		// no "name: value" date record; Split (':')[1] below would throw
			{
			Summary = "ERROR: no File-Date record found in the registry text";
			Skip = true;
			return ArticleText;
			}
		Summary = "update to " + file_date.Split (':')[1].Trim() + " data;";
		}

	switch (ArticleTitle)
		{
		case "Module:ISO 639 name/ISO 639-1":
			replacement = String.Join(",\n\t", ISO_639_1.ToArray());		// concatenate the members of the list into one string
			break;

		case "Module:Lang/data/iana languages":							// this module holds two tables so it is assembled and returned here
			string active = String.Join(",\n\t", iana_languages.ToArray());
			string deprecated = String.Join(",\n\t", iana_languages_dep.ToArray());
			return "-- " + file_date + "\nlocal active = {\n\t" + active + "\n\t}\n\nlocal deprecated = {\n\t" + deprecated + "\n\t}\n\nreturn {\n\tactive = active,\n\tdeprecated = deprecated,\n\t}";

		case "Module:Lang/data/iana scripts":
			replacement = String.Join(",\n\t", iana_scripts.ToArray());
			break;

		case "Module:Lang/data/iana regions":
			replacement = String.Join(",\n\t", iana_regions.ToArray());
			break;

		case "Module:Lang/data/iana suppressed scripts":
			List&lt;string&gt; temp = new List&lt;string&gt;();
			foreach (KeyValuePair&lt;string, string&gt; kvp in iana_suppressed_scripts)
				{
				string script = "[\"" + kvp.Key + "\"] = ";				// Lua table key for this script
				string[] subtags_array = kvp.Value.Split (',');			// stored value is a comma-separated list of quoted subtags

				temp.Add (script + "{" + prettify_suppressed_subtags (subtags_array) + "}");
				}
			replacement = String.Join(",\n\t", temp.ToArray());
			break;

		case "Module:Lang/data/iana variants":
			replacement = String.Join(",\n\t", iana_variants.ToArray());
			break;

		default:
			Summary = "Unexpected article: " + ArticleTitle;
			Skip = true;
			return ArticleText;
		}

	return "-- " + file_date + "\nreturn {\n\t" + replacement + "\n\t}";	// add file date, open and close the Lua table
	}
		

//---------------------------&lt; P R E T T I F Y _ S U P P R E S S E D _ S U B T A G S &gt;------------------------
//
// Because Latn has a lot of suppressed subtags, to keep the list from running off the right side of the screen
// we prettify it so that it is several rows of 11 language subtags.
//

// Render the subtags of one suppressed script as the inside of a Lua table.  The subtags are written in
// rows of at most 11, separated by ", " within a row; rows are joined with a comma, a newline, and four tabs.
// A list of 11 or fewer subtags therefore comes back as a single row.
private string prettify_suppressed_subtags (string[] subtags_array)
	{
	const int LIMIT = 11;												// max number of subtags in a row of text
	int total = subtags_array.Length;
	List&lt;string&gt; rows = new List&lt;string&gt;();							// one comma-separated string per row

	for (int start = 0; start != total; start += Math.Min (LIMIT, total - start))
		{
		int take = Math.Min (LIMIT, total - start);						// last row may be short
		rows.Add (String.Join (", ", subtags_array, start, take));
		}

	return String.Join (",\n\t\t\t\t", rows.ToArray());					// concatenate the rows and done
	}


//==========================&lt;&lt; S T A T I C   I N I T I A L I Z A T I O N &gt;&gt;===================================
//
// Fetch the plain-text registry file from the IANA website, parse it apart much as is done by
// [[Module:Lang/data/iana languages/make]].  But, instead of one large file from which sections are copied
// and then pasted into the individual modules, create separate 'files' and then use awb to simply assign
//		ArticleText = &lt;new text for ArticleTitle&gt;
// and then return.  The script's xml file holds the list of modules to be updated.

// original snippets of this taken from [[Wikipedia:AutoWikiBrowser/Custom_Modules#Passing_text_to_external_program_for_processing]]
//


//---------------------------&lt; S T A T I C   D A T A &gt;--------------------------------------------------------

	// where the registry comes from
	private static string	subtag_reg_file_url = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry";	// fetched by the static constructor
	private static string	subtag_reg_file_name = "Z:\\Wikipedia\\language-subtag-registry_2020-07-17.txt";	// local copy; referenced only by the commented-out file-reading code in the static constructor

	// state filled in by the static constructor
	private static string[]	sub_tag_records_array;						// registry text split on its "%%" separators
	private static string	file_date;									// first chunk of the registry text, trimmed
	private static string	ERROR_summary = null;						// when not null, ProcessArticle() skips the page and reports this text
	private static string	DEBUG_summary = null;						// when not null, ProcessArticle() uses "DEBUG: " + this text as the edit summary

	// one Lua table entry per registry record; rendered by ProcessArticle()
	private static List&lt;string&gt; iana_languages = new List&lt;string&gt;();		// language subtags that are not deprecated
	private static List&lt;string&gt; iana_languages_dep = new List&lt;string&gt;();	// deprecated language subtags
	private static List&lt;string&gt; ISO_639_1 = new List&lt;string&gt;();			// the 2-character members of iana_languages
	private static List&lt;string&gt; iana_scripts = new List&lt;string&gt;();
	private static List&lt;string&gt; iana_regions = new List&lt;string&gt;();
	private static List&lt;string&gt; iana_variants = new List&lt;string&gt;();
	private static SortedList&lt;string, string&gt; iana_suppressed_scripts = new SortedList&lt;string, string&gt;();	// script subtag -&gt; comma-separated list of quoted language subtags



//---------------------------&lt; S T A T I C   C O N S T R U C T O R &gt;------------------------------------------
//
// static constructor for the CustomModule class
//

// Static constructor for the CustomModule class.
// Downloads the registry from subtag_reg_file_url, splits it on the "%%" separators, keeps the first
// chunk as file_date, and hands each language, script, region, and variant record to its parser.
// Any failure, while downloading or while parsing, is recorded in ERROR_summary so that ProcessArticle()
// skips every page; no exception is allowed to escape from the type initializer.
static CustomModule()
	{
	string sub_tag_registry_text = "";

	try
		{
		// this WebRequest code courtesy of en.wiki editor User:DavidBrooks
		System.Net.HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(subtag_reg_file_url);
		webRequest.UserAgent = "IANA subtag registry file update (https://en.wikipedia.org/wiki/User:Trappist_the_monk/IANA_subtag_registry_file_update)";

		using (System.Net.WebResponse response = webRequest.GetResponse())		// dispose the connection, stream, and reader when done
		using (System.IO.Stream str = response.GetResponseStream())
		using (System.IO.StreamReader reader = new System.IO.StreamReader(str))
			sub_tag_registry_text = reader.ReadToEnd();

//		alternative source: read a local copy of the registry instead of fetching it
//		if (System.IO.File.Exists (subtag_reg_file_name))
//			{
//			using (System.IO.StreamReader reader = System.IO.File.OpenText (subtag_reg_file_name))
//				sub_tag_registry_text = reader.ReadToEnd();
//			}
//		else
//			ERROR_summary = "file not found: " + subtag_reg_file_name;
		}
	catch (Exception e)
		{
		sub_tag_registry_text = "";										// discard anything partially read
		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_url + " (" + e.Message + ")";
//		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_name + " (" + e.Message + ")";
		}

	sub_tag_records_array = sub_tag_registry_text.Split (new string[] {"%%"}, StringSplitOptions.None);
	sub_tag_registry_text = null;										// unset this because we no longer need it

	file_date = sub_tag_records_array[0].Trim();						// get the file date

	try
		{
		foreach (string record in sub_tag_records_array.Skip(1))		// skip file date
			{
			string[] record_elements = record.Trim().Split ('\n');		// trim the record then split on the newlines between record elements

			switch (record_elements[0].Trim())							// other record types are ignored
				{
				case "Type: language":
					lang_parse (record_elements);
					break;

				case "Type: script":
					script_parse (record_elements);
					break;

				case "Type: region":
					region_parse (record_elements);
					break;

				case "Type: variant":
					variant_parse (record_elements);
					break;
				}
			}
		}
	catch (Exception e)
		{
		ERROR_summary = "Exception occurred parsing the registry: " + e.Message;
		}
	}


//---------------------------&lt; V A R I A N T _ P A R S E &gt;----------------------------------------------------
//
// Read a variant record and add the important bits to the iana_variants list
//

// Read one variant record and append its Lua table entry
//		["subtag"] = { ["descriptions"] = {...}, ["prefixes"] = {...}, }
// to the iana_variants list.  Deprecated variants, and records that have no Subtag field, add nothing.
//		record_elements - the lines of one registry record
static void variant_parse (string[] record_elements)
	{
	string	variant = null;
	string	descriptions = null;										// comma-separated list of quoted descriptions
	bool	in_description = false;										// true while the most recent field was a Description
	List&lt;string&gt; prefixes = new List&lt;string&gt;();

	foreach (string element in record_elements)
		{
		if (element.StartsWith ("  ", StringComparison.Ordinal))		// when line begins with two spaces it is a continuation of the previous line
			{
			if (in_description)											// only descriptions are kept so only their continuations matter
				{
				string continuation = element.Trim().Replace ("\"", "\\\"");	// escape double quotes the same way as the description itself
				descriptions = descriptions.Substring (0, descriptions.Length-1) + " " + continuation + "\"";	// replace the closing double quote, add the continuation, close again
				}
			continue;
			}

		string[] element_parts = element.Split (new char[] {':'}, 2);	// split at the first colon only so that colons inside the value survive
		if (1 == element_parts.Length)									// not a 'name: value' line
			{
			in_description = false;
			continue;
			}

		string field_name = element_parts[0];
		string field_body = element_parts[1].Trim();
		in_description = String.Equals ("Description", field_name);

		if (String.Equals ("Subtag", field_name))
			variant = field_body;
		else if (in_description)										// a variant name; may continue on the next line
			{
			string description = "\"" + field_body.Replace ("\"", "\\\"") + "\"";	// escaped for use inside a Lua string
			if (null == descriptions)
				descriptions = description;
			else
				descriptions = descriptions + ", " + description;
			}
		else if (String.Equals ("Prefix", field_name))
			prefixes.Add ("\"" + field_body.ToLowerInvariant() + "\"");	// invariant so that the result does not depend on the machine's culture
		else if (String.Equals ("Deprecated", field_name))
			return;														// deprecated variants are not listed
		else if (String.Equals ("Comments", field_name))
			break;														// ignore comments until the end; presume that these happen after Description elements
		}

	if (null == variant)												// no Subtag field; nothing to key the entry with
		return;

	string description_entry = "[\"descriptions\"] = {" + descriptions + "},";
	string prefix_entry = "[\"prefixes\"] = {" + String.Join (", ", prefixes.ToArray()) + "},";

	iana_variants.Add ("[\"" + variant + "\"] = " + "{\n\t\t" + description_entry + "\n\t\t" + prefix_entry + "\n\t\t}");
	}



//---------------------------&lt; R E G I O N _ P A R S E &gt;------------------------------------------------------
//
// Read a region record and add the important bits to the iana_regions list
//

// Hand a region record to the common parser and, unless the parser rejects the record by returning
// null, append the resulting Lua table entry to iana_regions.
static void region_parse (string[] record_elements)
	{
	string	ignored_subtag = null;										// ref arguments demanded by lang_script_region_parse();
	bool	ignored_deprecated = false;									// their returned values are not used for regions

	string lua_entry = lang_script_region_parse (record_elements, ref ignored_subtag, ref ignored_deprecated);
	if (null == lua_entry)
		return;

	iana_regions.Add (lua_entry);
	}


//---------------------------&lt; S C R I P T _ P A R S E &gt;------------------------------------------------------
//
// Read a script record and add the important bits to the iana_scripts list
//

// Hand a script record to the common parser and, unless the parser rejects the record by returning
// null, append the resulting Lua table entry to iana_scripts.
static void script_parse (string[] record_elements)
	{
	string	ignored_subtag = null;										// ref arguments demanded by lang_script_region_parse();
	bool	ignored_deprecated = false;									// their returned values are not used for scripts

	string lua_entry = lang_script_region_parse (record_elements, ref ignored_subtag, ref ignored_deprecated);
	if (null == lua_entry)
		return;

	iana_scripts.Add (lua_entry);
	}


//---------------------------&lt; L A N G _ P A R S E &gt;----------------------------------------------------------
//
// Read a language record and add the important bits to the iana_languages_dep list (deprecated subtags) or,
// for subtags that are not deprecated, to the iana_languages list and also, for 2-character subtags, the ISO_639_1 list
//

// Hand a language record to the common parser and file the resulting Lua table entry:
//		deprecated subtag       -> iana_languages_dep only
//		subtag not deprecated   -> iana_languages, and also ISO_639_1 when the subtag has 2 characters
// Nothing is filed when the parser returns null.
static void lang_parse (string[] record_elements)
	{
	string	code = null;												// set by the parser to the record's subtag
	bool	is_deprecated = false;										// set by the parser when the record has a Deprecated field
	string	lua_entry = lang_script_region_parse (record_elements, ref code, ref is_deprecated);

	if (null == lua_entry)
		return;

	if (is_deprecated)
		{
		iana_languages_dep.Add (lua_entry);
		return;
		}

	if (2 == code.Length)
		ISO_639_1.Add (lua_entry);

	iana_languages.Add (lua_entry);
	}


//---------------------------&lt; L A N G _ S C R I P T _ R E G I O N _ P A R S E &gt;------------------------------
//
// common function for extracting the important bits from language, script, and region records
//

// Common function for extracting the important bits from language, script, and region records.
//		record_elements - the lines of one registry record; the first line is the record's Type line
//		subtag          - receives the value of the record's Subtag field
//		deprecated      - set true when a language record has a Deprecated field
// Returns the Lua table entry  ["subtag"] = {"name", "name", ...}  or null when the record is not wanted:
// a 'Private use' description, a deprecated script or region, or a record that has no Subtag field.
// Side effect: a Suppress-Script field is recorded through suppress_script_add().
static string lang_script_region_parse (string[]record_elements, ref string subtag, ref bool deprecated)
	{
	string	names = "";													// comma-separated list of quoted descriptions
	bool	in_description = false;										// true while the most recent field was a Description
	bool	is_language = String.Equals ("Type: language", record_elements[0].Trim());	// trimmed, as the static constructor does when it dispatches on this line

	foreach (string element in record_elements)
		{
		if (element.StartsWith ("  ", StringComparison.Ordinal))		// line begins with two spaces: it is a continuation of the previous line
			{
			if (in_description)											// only descriptions are kept so only their continuations matter
				{
				string continuation = element.Trim().Replace ("\"", "\\\"");	// escape double quotes for use inside a Lua string
				names = names.Substring (0, names.Length-1) + " " + continuation + "\"";	// replace the closing double quote, add the continuation, close again
				}
			continue;
			}

		string[] element_parts = element.Split (new char[] {':'}, 2);	// split at the first colon only so that colons inside the value survive
		if (1 == element_parts.Length)									// not a 'name: value' line
			{
			in_description = false;
			continue;
			}

		string field_name = element_parts[0];
		string field_body = element_parts[1].Trim();
		in_description = String.Equals ("Description", field_name);

		if (String.Equals ("Subtag", field_name))
			subtag = field_body;
		else if (in_description)										// a name; may continue on the next line
			{
			if (String.Equals ("Private use", field_body))				// private use subtags not supported
				return null;

			string name = "\"" + field_body.Replace ("\"", "\\\"") + "\"";
			if (0 == names.Length)
				names = name;
			else
				names = names + ", " + name;
			}
		else if (String.Equals ("Suppress-Script", field_name))
			suppress_script_add (field_body, subtag);					// add this subtag to this script's list
		else if (String.Equals ("Deprecated", field_name))
			{
			if (!is_language)											// ignore deprecated script and region tags
				return null;
			deprecated = true;											// report to calling function that this subtag is deprecated
			}
		else if (String.Equals ("Comments", field_name))
			break;														// ignore comments until the end; presume that these happen after Description elements
		}

	if (null == subtag)													// no Subtag field; nothing to key the entry with
		return null;

	return "[\"" + subtag.Trim() + "\"] = " + "{" + names + "}";
	}



//---------------------------&lt; S U P P R E S S _ S C R I P T _ A D D &gt;----------------------------------------
//
// manages the addition of a new script and its suppressed subtag and manages the addition of a new subtag to
// a script's existing list of subtags
//

// Record that language subtag 'subtag' suppresses 'script'.  Each script key in iana_suppressed_scripts
// maps to a comma-separated list of double-quoted subtags; the first subtag for a script starts the list
// and later ones are appended to it.
static void suppress_script_add (string script, string subtag)
	{
	string quoted_subtag = "\"" + subtag + "\"";
	string existing;

	if (iana_suppressed_scripts.TryGetValue (script, out existing))
		iana_suppressed_scripts[script] = existing + "," + quoted_subtag;	// extend this script's list
	else
		iana_suppressed_scripts.Add (script, quoted_subtag);			// start a new list for this script
	}


// Z:\Wikipedia\AWB\IANA_subtag_registry_file_update.cs</Code>
  </Module>
  <ExternalProgram>
    <Enabled>false</Enabled>
    <Skip>false</Skip>
    <Program />
    <Parameters />
    <PassAsFile>true</PassAsFile>
    <OutputFile />
  </ExternalProgram>
  <Disambiguation>
    <Enabled>false</Enabled>
    <Link />
    <Variants />
    <ContextChars>20</ContextChars>
  </Disambiguation>
  <Special>
    <namespaceValues />
    <remDupes>true</remDupes>
    <sortAZ>true</sortAZ>
    <filterTitlesThatContain>false</filterTitlesThatContain>
    <filterTitlesThatContainText />
    <filterTitlesThatDontContain>false</filterTitlesThatDontContain>
    <filterTitlesThatDontContainText />
    <areRegex>false</areRegex>
    <opType>0</opType>
    <remove />
  </Special>
  <Tool>
    <ListComparerUseCurrentArticleList>0</ListComparerUseCurrentArticleList>
    <ListSplitterUseCurrentArticleList>0</ListSplitterUseCurrentArticleList>
    <DatabaseScannerUseCurrentArticleList>0</DatabaseScannerUseCurrentArticleList>
  </Tool>
  <Plugin>
    <PluginPrefs>
      <Name>CSV Loader</Name>
      <PluginSettings>
        <anyType xsi:type="PrefsKeyPair">
          <Name>TextMode</Name>
          <Setting xsi:type="xsd:string">Append</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>InputText</Name>
          <Setting xsi:type="xsd:string" />
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>ColumnHeaders</Name>
          <Setting xsi:type="xsd:string" />
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>Skip</Name>
          <Setting xsi:type="xsd:boolean">true</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>Separator</Name>
          <Setting xsi:type="xsd:string">,</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>CreateLists</Name>
          <Setting xsi:type="xsd:boolean">false</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>ListSeparator</Name>
          <Setting xsi:type="xsd:string">^</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>FindReplace</Name>
          <Setting xsi:type="xsd:boolean">false</Setting>
        </anyType>
        <anyType xsi:type="PrefsKeyPair">
          <Name>EditSummary</Name>
          <Setting xsi:type="xsd:string" />
        </anyType>
      </PluginSettings>
    </PluginPrefs>
  </Plugin>
</AutoWikiBrowserPreferences>

c# module

[edit]
// this script is intended to simplify updates from:
//		IANA [https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry language-subtag-registry file]
// to:
//		[[Module:Lang/data/iana languages]]
//		[[Module:Lang/data/iana scripts]]
//		[[Module:Lang/data/iana regions]]
//		[[Module:Lang/data/iana variants]]
//		[[Module:Lang/data/iana suppressed scripts]]
//		[[Module:ISO 639 name/ISO 639-1]]
//
// updated 2024-07-12 to account for shift of data files from Module:Language/data/~ to Module:Lang/data/~

//---------------------------< P R O C E S S A R T I C L E >--------------------------------------------------
//
//
//

// Build the complete replacement Lua source for one of the data modules listed in the settings file.
//		ArticleText   - current text of the page; handed back untouched whenever the page is skipped
//		ArticleTitle  - full page name; selects which of the parsed registry lists is rendered
//		wikiNamespace - not used
//		Summary       - receives the edit summary, or an "ERROR: ..." / "DEBUG: ..." / "Unexpected article: ..." message
//		Skip          - set true when the page must not be saved
// Returns the new module text, or ArticleText when the page is skipped.
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
	{
	string replacement = null;
	Skip = false;

	if (null != ERROR_summary)											// static constructor could not get the registry; never save
		{
		Summary = "ERROR: " + ERROR_summary;
		Skip = true;
		return ArticleText;
		}

	if (null != DEBUG_summary)
		Summary = "DEBUG: " + DEBUG_summary;
	else
		{
		if ((null == file_date) || (-1 == file_date.IndexOf (':')))		// no "name: value" date record; Split (':')[1] below would throw
			{
			Summary = "ERROR: no File-Date record found in the registry text";
			Skip = true;
			return ArticleText;
			}
		Summary = "update to " + file_date.Split (':')[1].Trim() + " data;";
		}

	switch (ArticleTitle)
		{
		case "Module:ISO 639 name/ISO 639-1":
			replacement = String.Join(",\n\t", ISO_639_1.ToArray());		// concatenate the members of the list into one string
			break;

		case "Module:Lang/data/iana languages":							// this module holds two tables so it is assembled and returned here
			string active = String.Join(",\n\t", iana_languages.ToArray());
			string deprecated = String.Join(",\n\t", iana_languages_dep.ToArray());
			return "-- " + file_date + "\nlocal active = {\n\t" + active + "\n\t}\n\nlocal deprecated = {\n\t" + deprecated + "\n\t}\n\nreturn {\n\tactive = active,\n\tdeprecated = deprecated,\n\t}";

		case "Module:Lang/data/iana scripts":
			replacement = String.Join(",\n\t", iana_scripts.ToArray());
			break;

		case "Module:Lang/data/iana regions":
			replacement = String.Join(",\n\t", iana_regions.ToArray());
			break;

		case "Module:Lang/data/iana suppressed scripts":
			List<string> temp = new List<string>();
			foreach (KeyValuePair<string, string> kvp in iana_suppressed_scripts)
				{
				string script = "[\"" + kvp.Key + "\"] = ";				// Lua table key for this script
				string[] subtags_array = kvp.Value.Split (',');			// stored value is a comma-separated list of quoted subtags

				temp.Add (script + "{" + prettify_suppressed_subtags (subtags_array) + "}");
				}
			replacement = String.Join(",\n\t", temp.ToArray());
			break;

		case "Module:Lang/data/iana variants":
			replacement = String.Join(",\n\t", iana_variants.ToArray());
			break;

		default:
			Summary = "Unexpected article: " + ArticleTitle;
			Skip = true;
			return ArticleText;
		}

	return "-- " + file_date + "\nreturn {\n\t" + replacement + "\n\t}";	// add file date, open and close the Lua table
	}
		

//---------------------------< P R E T T I F Y _ S U P P R E S S E D _ S U B T A G S >------------------------
//
// Because Latn has a lot of suppressed subtags, to keep the list from running off the right side of the screen
// we prettify it so that it is several rows of 11 language subtags.
//

// Render the subtags of one suppressed script as the inside of a Lua table.  The subtags are written in
// rows of at most 11, separated by ", " within a row; rows are joined with a comma, a newline, and four tabs.
// A list of 11 or fewer subtags therefore comes back as a single row.
private string prettify_suppressed_subtags (string[] subtags_array)
	{
	const int LIMIT = 11;												// max number of subtags in a row of text
	int total = subtags_array.Length;
	List<string> rows = new List<string>();								// one comma-separated string per row

	for (int start = 0; total > start; start += LIMIT)
		{
		int take = Math.Min (LIMIT, total - start);						// last row may be short
		rows.Add (String.Join (", ", subtags_array, start, take));
		}

	return String.Join (",\n\t\t\t\t", rows.ToArray());					// concatenate the rows and done
	}


//==========================<< S T A T I C   I N I T I A L I Z A T I O N >>===================================
//
// Fetch the plain-text registry file from the IANA website, parse it apart much as is done by
// [[Module:Lang/data/iana languages/make]].  But, instead of one large file from which sections are copied
// and then pasted into the individual modules, create separate 'files' and then use awb to simply assign
//		ArticleText = <new text for ArticleTitle>
// and then return.  The script's xml file holds the list of modules to be updated.

// original snippets of this taken from [[Wikipedia:AutoWikiBrowser/Custom_Modules#Passing_text_to_external_program_for_processing]]
//


//---------------------------< S T A T I C   D A T A >--------------------------------------------------------

	// url of the IANA language-subtag-registry file; fetched by the static constructor
	static string	subtag_reg_file_url = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry";
	// local copy of the registry file; only referenced by the commented-out file-reading code in the static constructor
	static string	subtag_reg_file_name = "Z:\\Wikipedia\\language-subtag-registry_2020-07-17.txt";
	static string[]	sub_tag_records_array;								// subtag registry is read into a local string and then split into this static array
	// first "%%"-delimited section of the registry text, trimmed; the static constructor treats this as the file date
	static string	file_date;
	// not assigned anywhere in this part of the file; presumably debug text for the edit summary -- TODO confirm against the rest of the module
	static string	DEBUG_summary = null;
	// set by the static constructor when fetching the registry throws; null otherwise
	static string	ERROR_summary = null;

	// each list holds ready-made '["subtag"] = {...}' strings, one per registry record
	static List<string> iana_languages = new List<string>();			// non-deprecated language subtags
	static List<string> iana_languages_dep = new List<string>();		// deprecated language subtags
	static List<string> iana_scripts = new List<string>();				// script subtags
	static List<string> iana_regions = new List<string>();				// region subtags
	static List<string> iana_variants = new List<string>();			// variant subtags with their descriptions and prefixes
	// key: script from a Suppress-Script element; value: comma-separated list of quoted language subtags
	static SortedList<string, string> iana_suppressed_scripts = new SortedList<string, string>();
	static List<string> ISO_639_1 = new List<string>();				// the non-deprecated language subtags that are two characters long



//---------------------------< S T A T I C   C O N S T R U C T O R >------------------------------------------
//
// static constructor for the CustomModule class
//

static CustomModule()
	{
	// Fetches the IANA language-subtag-registry text, splits it into "%%"-delimited records, and hands each
	// language, script, region, and variant record to its parser so that the static lists are filled before
	// awb processes the first module.
	//
	// On any exception while fetching, ERROR_summary is set and the registry text is left empty so that
	// nothing is parsed.

	string sub_tag_registry_text = @"";
	try
		{
		// this WebRequest code courtesy of en.wiki editor User:DavidBrooks
		System.Net.HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(subtag_reg_file_url);
		webRequest.UserAgent = "IANA subtag registry file update (https://en.wikipedia.org/wiki/User:Trappist_the_monk/IANA_subtag_registry_file_update)";

		// response, stream, and reader all hold onto the network connection; dispose of them as soon as the text is read
		using (System.Net.WebResponse response = webRequest.GetResponse())
		using (System.IO.Stream str = response.GetResponseStream())
		using (System.IO.StreamReader reader = new System.IO.StreamReader (str))
			sub_tag_registry_text = reader.ReadToEnd();

		// alternative: read the registry from a local file instead of from the IANA website
//		if (System.IO.File.Exists (subtag_reg_file_name))
//			{
//			using (System.IO.StreamReader reader = System.IO.File.OpenText (subtag_reg_file_name))
//				sub_tag_registry_text = reader.ReadToEnd();
//			}
//		else
//			ERROR_summary = "file not found: " + subtag_reg_file_name;
		}
	catch
		{
		// deliberately best-effort: a static constructor must not throw, so record the failure and carry on with empty text
		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_url;
//		ERROR_summary = "Exception occurred reading: " + subtag_reg_file_name;
		}

	sub_tag_records_array = sub_tag_registry_text.Split (new string[] {"%%"}, StringSplitOptions.None);
	sub_tag_registry_text = null;										// unset this because we no longer need it

	file_date = sub_tag_records_array[0].Trim();						// get the file date
	foreach (string record in sub_tag_records_array.Skip(1))			// skip file date
		{
		string		trimmed_record = record.Trim();						// trim any leading and trailing whitespace
		string[]	record_elements = trimmed_record.Split ('\n');		// split on the newlines between record elements

		switch (record_elements[0].Trim())								// first element names the record type; other types are ignored
			{
			case "Type: language":
				lang_parse (record_elements);
				break;

			case "Type: script":
				script_parse (record_elements);
				break;

			case "Type: region":
				region_parse (record_elements);
				break;

			case "Type: variant":
				variant_parse (record_elements);
				break;
			}
		}
	}


//---------------------------< V A R I A N T _ P A R S E >----------------------------------------------------
//
// Read a variant record and add the important bits to the iana_variants list
//

static void variant_parse (string[] record_elements)
	{
	// Reads one variant record and appends a
	//		["<subtag>"] = { ["descriptions"] = {...}, ["prefixes"] = {...} }
	// string to iana_variants.
	//
	// record_elements: the lines of the record, one element per line.
	//
	// Deprecated variants are not added.  Everything from the Comments element onward is ignored.  A record
	// that has no Subtag element is skipped.

	string	variant = null;
	string	description_list = null;
	string	prefix_list = null;
	List<string> prefixes = new List<string>();
	bool	in_description = false;										// true while the most recent element was a Description

	foreach (string element in record_elements)
		{
		if (element.StartsWith ("  ", StringComparison.Ordinal))		// when line begins with two spaces it is a continuation of the previous line
			{
			if (in_description && (null != description_list))			// only Description continuations are wanted
				{
				string continuation = element.Substring (2).TrimEnd().Replace ("\"", "\\\"");	// whole line, colons and all; escape double quotes like the first line
				description_list = description_list.Substring (0, description_list.Length-1);	// remove trailing double quote character
				description_list = description_list + " " + continuation + "\"";				// add on the continuation and a new double quote character
				}
			continue;
			}

		string[] element_parts = element.Split (new char[] {':'}, 2);	// split at the first colon only so that colons in the value survive
		string	field = element_parts[0];
		string	value = (2 == element_parts.Length) ? element_parts[1].Trim() : "";

		in_description = String.Equals ("Description", field);

		if (String.Equals ("Subtag", field))
			variant = value;

		if (in_description)												// a variant name; may continue on the next line
			{
			string description = value.Replace ("\"", "\\\"");			// escape double quotes for the lua string
			if (null == description_list)								// done this way because descriptions can continue on the next line
				description_list = "\"" + description + "\"";
			else
				description_list = description_list + ", \"" + description + "\"";
			}

		if (String.Equals ("Prefix", field))
			prefixes.Add ("\"" + value.ToLower() + "\"");

		if (String.Equals ("Deprecated", field))
			return;														// deprecated variants are not listed

		if (String.Equals ("Comments", field))
			break;														// ignore comments until the end; presume that these happen after Description elements
		}

	if (null == variant)												// malformed record; nothing to key the entry on
		return;

	description_list = "[\"descriptions\"] = {" + description_list + "},";
	prefix_list = "[\"prefixes\"] = {" + String.Join (", ", prefixes.ToArray()) + "},";

	iana_variants.Add ("[\"" + variant.Trim() + "\"] = " + "{\n\t\t" + description_list + "\n\t\t" + prefix_list + "\n\t\t}");
	}



//---------------------------< R E G I O N _ P A R S E >------------------------------------------------------
//
// Read a region record and add the important bits to the iana_regions list
//

static void region_parse (string[] record_elements)
	{
	// Hands a region record to the common parser and, when that yields an entry, stores the entry in
	// iana_regions.
	//
	// record_elements: the lines of the record, one element per line.

	string	ignored_subtag = null;										// not used here; required by lang_script_region_parse()
	bool	ignored_deprecated = false;									// not used here; required by lang_script_region_parse()
	string	entry = lang_script_region_parse (record_elements, ref ignored_subtag, ref ignored_deprecated);

	if (null == entry)													// record was rejected by the common parser
		return;

	iana_regions.Add (entry);
	}


//---------------------------< S C R I P T _ P A R S E >------------------------------------------------------
//
// Read a script record and add the important bits to the iana_scripts list
//

static void script_parse (string[] record_elements)
	{
	// Hands a script record to the common parser and, when that yields an entry, stores the entry in
	// iana_scripts.
	//
	// record_elements: the lines of the record, one element per line.

	string	ignored_subtag = null;										// not used here; required by lang_script_region_parse()
	bool	ignored_deprecated = false;									// not used here; required by lang_script_region_parse()
	string	entry = lang_script_region_parse (record_elements, ref ignored_subtag, ref ignored_deprecated);

	if (null == entry)													// record was rejected by the common parser
		return;

	iana_scripts.Add (entry);
	}


//---------------------------< L A N G _ P A R S E >----------------------------------------------------------
//
// Read a language record and add the important bits to the ISO_639_1 list (2-character subtags) and / or the
// iana_languages list (both 2- and 3-character subtags)
//

static void lang_parse (string[] record_elements)
	{
	// Hands a language record to the common parser and files the resulting entry:
	//		deprecated subtags go to iana_languages_dep only;
	//		all other subtags go to iana_languages, and those that are two characters long also go to ISO_639_1.
	//
	// record_elements: the lines of the record, one element per line.

	string	tag = null;													// filled in by the common parser
	bool	is_deprecated = false;										// filled in by the common parser
	string	entry = lang_script_region_parse (record_elements, ref tag, ref is_deprecated);

	if (null != entry)													// null when the common parser rejected the record
		{
		if (!is_deprecated)
			{
			if (2 == tag.Length)
				ISO_639_1.Add (entry);									// gets only 2-character subtags

			iana_languages.Add (entry);									// gets every non-deprecated subtag
			}
		else
			iana_languages_dep.Add (entry);								// gets every deprecated subtag
		}
	}


//---------------------------< L A N G _ S C R I P T _ R E G I O N _ P A R S E >------------------------------
//
// common function for extracting the important bits from language, script, and region records
//

static string lang_script_region_parse (string[]record_elements, ref string subtag, ref bool deprecated)
	{
	// Extracts the subtag and its names from a language, script, or region record.
	//
	// record_elements: the lines of the record, one element per line; the first is the 'Type: ...' line.
	// subtag:          receives the value of the record's Subtag element.
	// deprecated:      set true when a language record has a Deprecated element.
	// returns:         '["<subtag>"] = {"<name>", ...}', or null when the record is private use, is a
	//                  deprecated script or region, or has no Subtag element.
	//
	// Side effect: every Suppress-Script element met before returning is registered via suppress_script_add().
	// Everything from the Comments element onward is ignored.

	string	names = "";
	bool	in_description = false;										// true while the most recent element was a Description
	bool	is_language = (0 < record_elements.Length) &&
				String.Equals ("Type: language", record_elements[0].Trim());	// trimmed so that a trailing carriage return cannot defeat the comparison

	foreach (string element in record_elements)
		{
		if (element.StartsWith ("  ", StringComparison.Ordinal))		// line begins with two spaces it is a continuation of the previous line
			{
			if (in_description && (0 != names.Length))					// only Description continuations are wanted
				{
				string continuation = element.Substring (2).TrimEnd().Replace ("\"", "\\\"");	// whole line, colons and all
				names = names.Substring (0, names.Length-1);			// remove trailing double quote character
				names = names + " " + continuation + "\"";				// add on the continuation and a new double quote character
				}
			continue;
			}

		string[] element_parts = element.Split (new char[] {':'}, 2);	// split at the first colon only so that colons in the value survive
		string	field = element_parts[0];
		string	value = (2 == element_parts.Length) ? element_parts[1].Trim() : "";

		in_description = String.Equals ("Description", field);

		if (String.Equals ("Subtag", field))
			subtag = value;

		if (in_description)												// a language name; may continue on the next line
			{
			if (String.Equals ("Private use", value))					// private use subtags not supported
				return null;

			string name = value.Replace ("\"", "\\\"");				// escape double quotes for the lua string
			if (0 == names.Length)										// done this way because descriptions can continue on the next line
				names = "\"" + name + "\"";
			else
				names = names + ", \"" + name + "\"";
			}

		if (String.Equals ("Suppress-Script", field))
			suppress_script_add (value, subtag);						// add this subtag to this script's list

		if (String.Equals ("Deprecated", field))
			{
			if (!is_language)											// ignore deprecated script and region tags
				return null;
			deprecated = true;											// report to calling function that this subtag is deprecated
			}

		if (String.Equals ("Comments", field))
			break;														// ignore comments until the end; presume that these happen after Description elements
		}

	if (null == subtag)													// malformed record; nothing to key the entry on
		return null;

	return "[\"" + subtag.Trim() + "\"] = " + "{" + names + "}";
	}



//---------------------------< S U P P R E S S _ S C R I P T _ A D D >----------------------------------------
//
// manages the addition of a new script and its suppressed subtag and manages the addition of a new subtag to
// a script's existing list of subtags
//

static void suppress_script_add (string script, string subtag)
	{
	// Records that subtag suppresses script in iana_suppressed_scripts.
	//
	// script: the script; used as the key.
	// subtag: the language subtag to list under that script.
	//
	// The stored value is a comma-separated list (no space after the comma) of double-quoted subtags.

	string	quoted_subtag = "\"" + subtag + "\"";
	string	listed_so_far;

	if (!iana_suppressed_scripts.TryGetValue (script, out listed_so_far))
		{
		iana_suppressed_scripts.Add (script, quoted_subtag);			// first subtag seen for this script
		return;
		}

	iana_suppressed_scripts[script] = listed_so_far + "," + quoted_subtag;	// append to the script's existing list
	}


// Z:\Wikipedia\AWB\IANA_subtag_registry_file_update.cs