Module:tsg-jawi sc

From Wiktionary, the free dictionary
Jump to navigation Jump to search


local export = {}

local lang = require("Module:languages").getByCode("tsg")
local sc = require("Module:scripts").getByCode("Arab")

local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsubn = mw.ustring.gsub
local rsplit = mw.text.split
local u = require("Module:string/char")

local m_str_utils = require("Module:string utilities")
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD

-- Single-result wrapper around rsubn(): performs the substitution and hands
-- back only the rewritten string, dropping the replacement count.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end

-- Keep applying rsub() with the same pattern/replacement until a pass leaves
-- the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end

-- Pattern for a romanized vowel letter. preprocess() folds e -> i and o -> u
-- before the Jawi conversion, but the class is also used on raw input.
local V = "[aeiou]"
-- Pattern for a word-internal separator (space or hyphen).
-- NOTE(review): not referenced anywhere in the visible part of this module.
local separator = "[ -]"
-- Combining marks that decompose() looks for after NFD normalization.
local MACRON = u(0x0304) -- U+0304 combining macron (above)
local UMACRON = u(0x0331) -- U+0331 combining macron below ("under macron")
local TILDE = u(0x0303) -- U+0303 combining tilde
local DIA = u(0x0308) -- U+0308 combining diaeresis
local DOT_BELOW = u(0x0323) -- U+0323 combining dot below

--------------------------- hamja processing ------------------------------

-- hamja (Arabic hamza) variants. convert_jawi() picks one of these "seats"
-- for a glottal stop depending on the neighbouring vowels.
local HAMJA            = u(0x0621) -- hamja on the line (stand-alone hamza) = ء
local HAMJA_ON_ALIP    = u(0x0623) -- أ
local HAMJA_ON_WAW     = u(0x0624) -- ؤ
local HAMJA_UNDER_ALIF = u(0x0625) -- إ
local HAMJA_ON_YA      = u(0x0626) -- ئ
-- Character class matching any of the hamja forms above.
-- NOTE(review): HAMJA_ANY and HAMJA_PH are not referenced in the visible part of this module.
local HAMJA_ANY        = "[" .. HAMJA .. HAMJA_ON_ALIP .. HAMJA_UNDER_ALIF .. HAMJA_ON_WAW .. HAMJA_ON_YA .. "]"
local HAMJA_PH         = u(0xFFF0) -- hamja placeholder

-- Arabic diacritics (short-vowel marks emitted for a, u, i)
local A  = u(0x064E) -- hataas (fatḥa)
local U  = u(0x064F) -- dapan (ḍamma)
local I  = u(0x0650) -- hababa' (kasra)

local PT = u(0x0652) -- patay (sukūn) = no vowel
local SB = u(0x0651) -- sabtu' (šadda) = gemination of consonants

-- Arabic exclusive diacritics (only used here to build the DIACRITIC_ANY_BUT_SB class)
local AN = u(0x064B) -- duwa baris hataas (fatḥatān, fatḥa tanwīn)
local UN = u(0x064C) -- duwa baris dapan (ḍammatān, ḍamma tanwīn)
local IN = u(0x064D) -- duwa baris hababa' (kasratān, kasra tanwīn)
local DAGGER_ALIP = u(0x0670) -- U+0670 superscript (dagger) alif
-- Character class: any single diacritic listed above except the sabtu' (SB).
local DIACRITIC_ANY_BUT_SB = "[" .. A .. I .. U .. AN .. IN .. UN .. PT .. DAGGER_ALIP .. "]"

-- Pattern matching short vowels
-- NOTE(review): AIU and DIACRITIC are not referenced in the visible part of this module.
local AIU = "[" .. A .. I .. U .. "]"
-- Pattern matching any diacritics that may be on a consonant
-- (an optional sabtu' followed by exactly one other diacritic)
local DIACRITIC = SB .. "?" .. DIACRITIC_ANY_BUT_SB

-- various letters and signs
local ALIP   = u(0x0627) -- alip (ʾalif) = ا
local AMAQ   = u(0x0649) -- (ʾalif maqṣūra) = ى  (not referenced in the visible code)
local AMAD   = u(0x0622) -- alip madda (ʾalif madda) = آ
local WAW    = u(0x0648) -- wāw = و
local YA     = u(0x064A) -- yā = ي
-- Arabic-Indic digit two; preprocess() inserts it after a reduplicated base
-- and convert_jawi() copies it to the output unchanged.
local DUP	 = u(0x0662) -- 2 ٢ for reduplications

-- Maps each single-character consonant symbol of the normalized romanization
-- (as produced by preprocess(), which collapses digraphs such as "ng" -> "ŋ")
-- to its Arabic-script letter. convert_jawi() copies any character that has
-- no entry here (and is not a vowel) to the output unchanged.
local correspondence = {
	["'"] = HAMJA, -- ء (glottal stop; convert_jawi() may substitute a seated hamja form)
	["b"] = u(0x0628), -- bā' ب
	["t"] = u(0x062A), -- tā' ت
	["ṯ"] = u(0x062B), -- sā' ث
	["ñ"] = u(0x067E), -- nyā' پ
	["j"] = u(0x062C), -- jīm ج
	["ḥ"] = u(0x062D), -- hā' ح
	["ḵ"] = u(0x062E), -- khā' خ
	["ĉ"] = u(0x0686), -- chā' چ
	["d"] = u(0x062F), -- dāl د
	["ḏ"] = u(0x0630), -- jāl ذ
	["r"] = u(0x0631), -- rā' ر
	["z"] = u(0x0632), -- jāy' ز
	["s"] = u(0x0633), -- sīn س
	["š"] = u(0x0634), -- shīn ش
	["ṣ"] = u(0x0635), -- sād ص
	["ḍ"] = u(0x0636), -- dād ض
	["ṭ"] = u(0x0637), -- ṭā' ط
	["ẓ"] = u(0x0638), -- lā' ظ
	["ʕ"] = u(0x0639), -- ʕayn ع
	["ḡ"] = u(0x063A), -- ghayn غ
	["ŋ"] = u(0x06A0), -- ngā' ڠ
	["p"] = u(0x0641), -- pā' ف
	["q"] = u(0x0642), -- ḳāp ق
	["k"] = u(0x0643), -- kāp ك
	["g"] = u(0x0762), -- gā' ݢ
	["l"] = u(0x0644), -- lām ل
	["m"] = u(0x0645), -- mīm م
	["n"] = u(0x0646), -- nūn ن
	["h"] = u(0x0647), -- hā' ه
	["w"] = WAW, -- wāw و
	["y"] = YA, -- yā' ي
}

-- Short-vowel diacritic written for each romanized vowel.
local vowels = { ["a"] = A, ["i"] = I, ["u"] = U }
-- Hamja form ("seat") selected by an adjacent vowel.
local vowels_hamja = { ["a"] = HAMJA_ON_ALIP, ["i"] = HAMJA_ON_YA, ["u"] = HAMJA_ON_WAW }
-- Letter written for i/u when it directly follows another vowel.
local semivowel = { ["i"] = YA, ["u"] = WAW }

-- Put `text` into NFD, then re-join the specific base-letter + combining-mark
-- pairs listed below into single precomposed characters. Any other pair that
-- matches the pattern (for example a vowel followed by a macron) has no entry
-- in the table and is therefore left decomposed.
local function decompose(text)
	local recomposed = {
		["n" .. TILDE] = "ñ",
		["N" .. TILDE] = "Ñ",
		["u" .. DIA] = "ü",
		["U" .. DIA] = "Ü",
		["e" .. DIA] = "ë",
		["E" .. DIA] = "Ë",
		["t" .. UMACRON] = "ṯ",
		["k" .. UMACRON] = "ḵ",
		["d" .. UMACRON] = "ḏ",
		["h" .. DOT_BELOW] = "ḥ",
		["s" .. DOT_BELOW] = "ṣ",
		["d" .. DOT_BELOW] = "ḍ",
		["t" .. DOT_BELOW] = "ṭ",
		["z" .. DOT_BELOW] = "ẓ",
		["g" .. MACRON] = "ḡ"
	}
	-- Any character followed by one of the combining marks of interest.
	local marked_letter = ".[" .. TILDE .. DIA .. UMACRON .. DOT_BELOW .. MACRON .. "]"
	return rsub(toNFD(text), marked_letter, recomposed)
end

-- Collapse every run of whitespace into one space, then drop a single
-- leading and a single trailing space if present.
local function canon_spaces(text)
	local collapsed = rsub(text, "%s+", " ")
	local no_leading = rsub(collapsed, "^ ", "")
	return rsub(no_leading, " $", "")
end

-- Try to analyse `word` as a reduplicated form using `pattern`.
-- `has_infix` says whether the pattern has five captures
-- (pre, b1, inf, b2, suf) or four (pre, b1, b2, suf).
-- Returns a table with the fields pre/b1/inf/b2/suf, or nil when the pattern
-- does not match. When the base ends in a non-vowel and a suffix is present,
-- the base's final character is also prefixed to the suffix.
local function split_reduplication(word, pattern, has_infix)
	local pre, b1, inf, b2, suf
	if has_infix then
		pre, b1, inf, b2, suf = rmatch(word, pattern)
	else
		pre, b1, b2, suf = rmatch(word, pattern)
		inf = ""
	end
	-- `pre` may legitimately be "" (truthy in Lua); nil means "no match".
	if pre == nil then
		return nil
	end

	-- Last character of base
	local base_last = rmatch(b2, ".$")
	if not rfind(base_last, V) and suf ~= "" then
		suf = base_last .. suf
	end
	return { pre = pre, b1 = b1, inf = inf, b2 = b2, suf = suf }
end

-- Normalize romanized input into the one-symbol-per-sound form that
-- convert_jawi() consumes.
--   text  : romanized input string
--   nordp : when truthy, reduplicated words are left spelled out instead of
--           being abbreviated with the DUP digit
-- Steps: lowercase, recompose selected diacritic letters, canonicalize
-- spaces, fold vowels (ë -> a, e -> i, o/ü -> u), collapse consonant
-- digraphs, insert glottal stops between adjacent vowels and before
-- word-/hyphen-initial vowels, expand macron vowels, and finally mark
-- reduplication per word.
local function preprocess(text, nordp)
	text = ulower(text)
	text = decompose(text)
	text = canon_spaces(text)

	-- Convert vowels --
	text = rsub(text, "[ë]", "a")
	text = rsub(text, "e", "i")
	text = rsub(text, "[oü]", "u")

	-- Convert consonants like clusters --
	text = rsub(text, "ng", "ŋ")
	text = rsub(text, "ny", "ñ")
	text = rsub(text, "th", "ṯ")
	text = rsub(text, "dh", "ḏ")
	text = rsub(text, "gh", "ḡ")
	text = rsub(text, "kh", "ḵ")
	text = rsub(text, "ch", "ĉ")
	text = rsub(text, "ts", "ĉ")
	text = rsub(text, "sh", "š")
	text = rsub(text, "x", "ks")

	--c, gü/gu+e or i, q
	-- The replacements below emit a plain ASCII "g". They previously emitted
	-- the IPA letter U+0261, which has no entry in `correspondence` and was
	-- therefore copied into the Jawi output as a stray Latin character.
	-- NOTE(review): "ü" has already been folded to "u" above, so the two
	-- "gü" rules cannot match as currently ordered.
	text = rsub(text, "c([iey])", "s%1")
	text = rsub(text, "(" .. V .. ")gü([ie])", "%1gw%2")
	text = rsub(text, "gü([ie])", "guw%1")
	text = rsub(text, "gui([aeëo])", "gy%1")
	text = rsub(text, "gu([ie])", "g%1")
	text = rsub(text, "qu([ie])", "k%1")

	text = rsub(text, "Ꞌ", "'")
	-- The text was lower-cased above, so a saltillo will normally arrive in
	-- its lowercase form (U+A78C); treat it as a glottal stop as well.
	text = rsub(text, u(0xA78C), "'")
	text = rsub(text, "c", "k")
	text = rsub(text, "f", "p")
	text = rsub(text, "v", "b")

	-- Double consonants
	text = rsub(text, "dj", "jj")
	text = rsub(text, "nñ", "ññ")

	-- Correction for vowels with in-between glottal stop, now default
	text = rsub_repeatedly(text, "(" .. V .. "[" .. MACRON .. "]?)(" .. V .. ")", "%1'%2")
	-- Long vowels: ī -> iy, ū -> uw, any other macron vowel is doubled.
	text = rsub(text, "(i)" .. MACRON, "iy")
	text = rsub(text, "(u)" .. MACRON, "uw")
	text = rsub(text, "(".. V .. ")" .. MACRON, "%1%1")

	-- `words` is local so that it no longer leaks into the global environment.
	local words = rsplit(text, " ")
	for idx, word in ipairs(words) do
		-- "#" temporarily marks the word boundaries.
		word = "#" .. word .. "#"
		-- NOTE(review): every "x" was rewritten to "ks" above, so this
		-- word-initial x -> s rule cannot match as currently ordered.
		word = rsub(word, "(#)x(" .. V .. ")", "%1s%2") -- x to s
		word = rsub(word, "([#-])(" .. V .. ")", "%1'%2") -- Add initial glottal stop
		word = rsub(word, "#", "")
		words[idx] = word

		-- Look for a repeated base, trying in order:
		--   1. base immediately repeated, with an optional hyphen between copies
		--   2. first copy carrying an infix, hyphen before the second copy
		--   3. first copy carrying an infix, no hyphen
		local parts = split_reduplication(word, "(.*)(.)(.+)-?%2%3(.*)$", false)
			or split_reduplication(word, "(.*)(.)(.*)(.+)-%2%4(.*)$", true)
			or split_reduplication(word, "(.*)(.)(.*)(.+)%2%4(.*)$", true)

		if parts then
			local suf_first = parts.suf:sub(1, 1)
			-- Reject the analysis when a vowel-final base is followed by a
			-- suffix that starts with neither that same vowel nor "h", or
			-- when the material captured as an infix is "ag" or "al".
			-- TO DO: ACTUALLY CHECK IF TAUSUG PREFIXES
			local rejected = (parts.suf ~= ""
				and parts.b2:sub(-1) ~= suf_first
				and rfind(rmatch(parts.b2, ".$"), V)
				and suf_first ~= "h")
				or parts.inf == "ag" or parts.inf == "al"

			if not rejected and not nordp then
				words[idx] = parts.pre .. parts.b1 .. parts.inf .. parts.b2 .. DUP .. parts.suf
			end
		end
	end

	return table.concat(words, " ")
end

-- Convert normalized romanization (the output of preprocess()) to Jawi.
-- The text is split on spaces into words and each word on hyphens into
-- components; every component is walked character by character. Hyphens
-- themselves are not written to the output. Returns the Jawi string.
local function convert_jawi(text)
	local result = ""
	-- Declared local so these no longer leak into the global environment.
	local words = rsplit(text, " ")

	local function append(char)
		result = result .. char
	end

	-- Truthy when `char` contains a vowel letter; nil-safe.
	local function is_vowel(char)
		return rfind(char or "", V)
	end

	for wordct, word in ipairs(words) do
		local word_component = rsplit(word, "-")
		for _, comp_word in ipairs(word_component) do
			local i = 1
			-- An empty separator pattern splits the component into characters.
			local chars = rsplit(comp_word, "")

			while chars[i] do
				local next_c = chars[i+1]
				local c = chars[i]
				local last_c = chars[i-1]
				-- Vowels
				if is_vowel(c) and vowels[c] then
					if last_c == "a" and c == last_c then
						-- second "a" of "aa": written as alip
						append(ALIP)
					elseif is_vowel(last_c) then
						-- vowel directly after another vowel: semivowel letter + patay
						append(semivowel[c])
						append(PT)
					else
						append(vowels[c])
					end
				-- Consonants
				else
					if(correspondence[c]) then
						if last_c and last_c == c and not is_vowel(last_c) and chars[i-2] ~= c then
							-- second of two identical consonants: sabtu' instead of a repeated letter
							append(SB)
						else
							-- Process hamja
							if correspondence[c] == HAMJA then
								if i == 1 and is_vowel(next_c) then
									-- component-initial glottal stop before a vowel: bare alip
									append(ALIP)
								elseif(chars[i-2] ~= "a") then
									-- character two positions back is not "a"
									if next_c and (next_c == "i" or next_c == "u")  then
										-- seat chosen by the following i/u
										append(vowels_hamja[next_c])
									elseif last_c == DUP and is_vowel(next_c) then
										-- right after the reduplication digit: seat by following vowel
										append(vowels_hamja[next_c])
									elseif is_vowel(last_c) then
										-- seat chosen by the preceding vowel
										append(vowels_hamja[last_c])
									else
										append(correspondence[c])
									end
								-- From here on the character two positions back is "a".
								elseif (not is_vowel(last_c) and last_c ~= "y" and last_c ~= "w" and last_c ~= DUP) and is_vowel(next_c)  then
									append(vowels_hamja[next_c])
								elseif last_c == "y" and next_c and not is_vowel(next_c) and next_c ~= DUP then
									append(vowels_hamja["i"])
								elseif last_c == "w" and next_c and not is_vowel(next_c) and next_c ~= DUP then
									append(vowels_hamja["u"])
								else
									append(correspondence[c])
								end
							else
								append(correspondence[c])
							end
						end
						-- Vowelless consonant gets a patay, except the first
						-- of a doubled pair (its sabtu' follows instead).
						if not is_vowel(next_c) and (next_c ~= c or last_c == c) then
							append(PT)
						end
					else
						-- Not a mapped letter (punctuation, DUP digit, ...): copy through.
						append(c)
					end
				end
				i = i+1
			end
		end

		if wordct ~= #words then
			append(" ")
		end
	end

	-- alip or hamja-on-alip, optional hataas, then alip  ->  alip madda
	result = rsub(result, "[" .. ALIP .. HAMJA_ON_ALIP .. "]" .. A .. "?" .. ALIP, AMAD)

	return result
end

-- Final clean-up of converted text: canonicalize spaces, swap Latin
-- punctuation for its Arabic counterpart where the table gives one, and
-- return the result in NFC.
local function postprocess(text)
	-- Convert punctuation --
	local punctuation = {
		["."] = ".",
		["?"] = "؟",
		[","] = "،",
		["!"] = "!",
		[":"] = ":",
		[";"] = "؛"
	}

	local spaced = canon_spaces(text)
	local punctuated = rsub(spaced, "[.,?!:;]", punctuation)
	local normalized = toNFC(punctuated)

	return normalized
end

-- Run the full romanization -> Jawi pipeline on `text`.
-- `nordp` is forwarded to preprocess(), where a truthy value disables the
-- reduplication abbreviation.
function export.transcribe(text, nordp)
	local normalized = preprocess(text, nordp)
	local jawi = convert_jawi(normalized)
	local finished = postprocess(jawi)
	return finished
end

-- Template entry point. Reads the parent frame's arguments: positional
-- parameter 1 is the text to convert (falling back to the current page's
-- subpage text when absent) and `nordp` is passed on to export.transcribe().
function export.show(frame)
	local args = require("Module:parameters").process(frame:getParent().args, {
		[1] = {},
		["nordp"] = {}
	})

	local source = args[1]
	if not source then
		source = mw.title.getCurrentTitle().subpageText
	end

	local jawi = export.transcribe(source, args.nordp or false)
	return jawi
end

return export