-- Module:tsg-jawi sc
-- Jump to navigation
-- Jump to search
-- - This module lacks a documentation subpage. Please create it.
-- - Useful links: subpage list • links • transclusions • testcases • sandbox
local export = {}
local lang = require("Module:languages").getByCode("tsg")
local sc = require("Module:scripts").getByCode("Arab")
local rfind = mw.ustring.find
local rmatch = mw.ustring.match
local rsubn = mw.ustring.gsub
local rsplit = mw.text.split
local u = require("Module:string/char")
local m_str_utils = require("Module:string utilities")
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local toNFC = mw.ustring.toNFC
local toNFD = mw.ustring.toNFD
-- Substitute like rsubn(), but hand back only the resulting string; the
-- replacement count that rsubn() also returns is dropped.
local function rsub(term, foo, bar)
	-- The extra parentheses truncate the call to its first return value.
	return (rsubn(term, foo, bar))
end
-- Keep applying rsub() with the same pattern and replacement until a pass
-- leaves the string unchanged, then return that fixed point.
local function rsub_repeatedly(term, foo, bar)
	local previous
	repeat
		previous = term
		term = rsub(previous, foo, bar)
	until term == previous
	return term
end
-- Pattern matching a single Latin vowel letter of the romanized input.
local V = "[aeiou]"
-- Pattern matching a space or a hyphen.
-- NOTE(review): not referenced anywhere else in the visible source.
local separator = "[ -]"
-- Combining marks looked for on the romanized input (see decompose()).
local MACRON = u(0x0304) -- combining macron (above)
local UMACRON = u(0x0331) -- combining macron below ("under macron")
local TILDE = u(0x0303) -- combining tilde
local DIA = u(0x0308) -- combining diaeresis
local DOT_BELOW = u(0x0323) -- combining dot below
--------------------------- hamja processing ------------------------------
-- hamja (Arabic hamza) variants
local HAMJA = u(0x0621) -- hamja on the line (stand-alone hamza) = ء
local HAMJA_ON_ALIP = u(0x0623) -- hamja above alip = أ
local HAMJA_ON_WAW = u(0x0624) -- hamja above wāw = ؤ
local HAMJA_UNDER_ALIF = u(0x0625) -- hamja below alip = إ
local HAMJA_ON_YA = u(0x0626) -- hamja above yā = ئ
-- Character class matching any of the hamja forms above.
-- NOTE(review): HAMJA_ANY and HAMJA_PH are not referenced elsewhere in the visible source.
local HAMJA_ANY = "[" .. HAMJA .. HAMJA_ON_ALIP .. HAMJA_UNDER_ALIF .. HAMJA_ON_WAW .. HAMJA_ON_YA .. "]"
local HAMJA_PH = u(0xFFF0) -- hamja placeholder
-- Arabic diacritics
local A = u(0x064E) -- hataas (fatḥa)
local U = u(0x064F) -- dapan (ḍamma)
local I = u(0x0650) -- hababa' (kasra)
local PT = u(0x0652) -- patay (sukūn) = no vowel
local SB = u(0x0651) -- sabtu' (šadda) = gemination of consonants
-- Arabic exclusive diacritics
local AN = u(0x064B) -- duwa baris hataas (fatḥatān, fatḥa tanwīn)
local UN = u(0x064C) -- duwa baris dapan (ḍammatān, ḍamma tanwīn)
local IN = u(0x064D) -- duwa baris hababa' (kasratān, kasra tanwīn)
local DAGGER_ALIP = u(0x0670) -- dagger alip (superscript alif)
-- Character class matching any single diacritic listed above except sabtu' (SB).
local DIACRITIC_ANY_BUT_SB = "[" .. A .. I .. U .. AN .. IN .. UN .. PT .. DAGGER_ALIP .. "]"
-- Pattern matching short vowels
local AIU = "[" .. A .. I .. U .. "]"
-- Pattern matching any diacritics that may be on a consonant:
-- an optional sabtu' followed by exactly one other diacritic.
local DIACRITIC = SB .. "?" .. DIACRITIC_ANY_BUT_SB
-- various letters and signs
local ALIP = u(0x0627) -- alip (ʾalif) = ا
local AMAQ = u(0x0649) -- (ʾalif maqṣūra) = ى
local AMAD = u(0x0622) -- alip madda (ʾalif madda) = آ
local WAW = u(0x0648) -- wāw = و
local YA = u(0x064A) -- yā = ي
local DUP = u(0x0662) -- Arabic-Indic digit two (٢), used to mark reduplications
-- Maps each consonant symbol of the preprocessed romanization (one character
-- per consonant; digraphs such as "ng" are folded to single symbols in
-- preprocess()) to its Jawi letter.
local correspondence = {
["'"] = HAMJA, -- ء
["b"] = u(0x0628), -- bā' ب
["t"] = u(0x062A), -- tā' ت
["ṯ"] = u(0x062B), -- sā' ث
["ñ"] = u(0x067E), -- nyā' پ
["j"] = u(0x062C), -- jīm ج
["ḥ"] = u(0x062D), -- hā' ح
["ḵ"] = u(0x062E), -- khā' خ
["ĉ"] = u(0x0686), -- chā' چ
["d"] = u(0x062F), -- dāl د
["ḏ"] = u(0x0630), -- jāl ذ
["r"] = u(0x0631), -- rā' ر
["z"] = u(0x0632), -- jāy' ز
["s"] = u(0x0633), -- sīn س
["š"] = u(0x0634), -- shīn ش
["ṣ"] = u(0x0635), -- sād ص
["ḍ"] = u(0x0636), -- dād ض
["ṭ"] = u(0x0637), -- ṭā' ط
["ẓ"] = u(0x0638), -- lā' ظ
["ʕ"] = u(0x0639), -- ʕayn ع
["ḡ"] = u(0x063A), -- ghayn غ
["ŋ"] = u(0x06A0), -- ngā' ڠ
["p"] = u(0x0641), -- pā' ف
["q"] = u(0x0642), -- ḳāp ق
["k"] = u(0x0643), -- kāp ك
["g"] = u(0x0762), -- gā' ݢ
["l"] = u(0x0644), -- lām ل
["m"] = u(0x0645), -- mīm م
["n"] = u(0x0646), -- nūn ن
["h"] = u(0x0647), -- hā' ه
["w"] = WAW, -- wāw و
["y"] = YA, -- yā' ي
}
-- Romanized vowel -> short-vowel diacritic written on the preceding consonant.
local vowels = { ["a"] = A, ["i"] = I, ["u"] = U }
-- Romanized vowel -> hamja form used as the seat of a glottal stop next to that vowel.
local vowels_hamja = { ["a"] = HAMJA_ON_ALIP, ["i"] = HAMJA_ON_YA, ["u"] = HAMJA_ON_WAW }
-- Romanized vowel -> letter written when the vowel directly follows another vowel
-- (there is deliberately no entry for "a"; convert_jawi() handles "aa" separately).
local semivowel = { ["i"] = YA, ["u"] = WAW }
-- Put the text into NFD and then fold a fixed set of base-letter +
-- combining-mark pairs back into single precomposed characters, so that later
-- steps can treat each of those letters as one symbol. Pairs that are not in
-- the table (for example a vowel followed by a macron) stay decomposed.
local function decompose(text)
	local recompose = {
		["n" .. TILDE] = "ñ",
		["N" .. TILDE] = "Ñ",
		["u" .. DIA] = "ü",
		["U" .. DIA] = "Ü",
		["e" .. DIA] = "ë",
		["E" .. DIA] = "Ë",
		["t" .. UMACRON] = "ṯ",
		["k" .. UMACRON] = "ḵ",
		["d" .. UMACRON] = "ḏ",
		["h" .. DOT_BELOW] = "ḥ",
		["s" .. DOT_BELOW] = "ṣ",
		["d" .. DOT_BELOW] = "ḍ",
		["t" .. DOT_BELOW] = "ṭ",
		["z" .. DOT_BELOW] = "ẓ",
		["g" .. MACRON] = "ḡ"
	}
	-- Any character followed by one of the combining marks handled here.
	local marked_letter = ".[" .. TILDE .. DIA .. UMACRON .. DOT_BELOW .. MACRON .. "]"
	local folded = rsub(toNFD(text), marked_letter, recompose)
	return folded
end
-- Collapse every run of whitespace into one space, then strip a single
-- leading and a single trailing space if present.
local function canon_spaces(text)
	local collapsed = rsub(text, "%s+", " ")
	local without_leading = rsub(collapsed, "^ ", "")
	local trimmed = rsub(without_leading, " $", "")
	return trimmed
end
-- Normalize romanized text into the one-symbol-per-sound form that
-- convert_jawi() consumes.
--
-- Steps, in order: lower-case, recompose marked letters, canonicalize spaces,
-- fold vowels to a/i/u, fold digraphs and Spanish-style spellings to single
-- consonant symbols, insert glottal stops between adjacent vowels and before
-- word-initial vowels, expand macron vowels, and finally (unless `nordp` is
-- truthy) rewrite a detected full reduplication as "base" .. DUP .. "suffix".
--
-- @param text   romanized input string
-- @param nordp  when truthy, reduplications are left spelled out
-- @return the normalized string, words separated by single spaces
local function preprocess(text, nordp)
	text = ulower(text)
	text = decompose(text)
	text = canon_spaces(text)

	-- Convert vowels --
	text = rsub(text, "[ë]", "a")
	text = rsub(text, "e", "i")
	text = rsub(text, "[oü]", "u")

	-- Convert consonants like clusters --
	text = rsub(text, "ng", "ŋ")
	text = rsub(text, "ny", "ñ")
	text = rsub(text, "th", "ṯ")
	text = rsub(text, "dh", "ḏ")
	text = rsub(text, "gh", "ḡ")
	text = rsub(text, "kh", "ḵ")
	text = rsub(text, "ch", "ĉ")
	text = rsub(text, "ts", "ĉ")
	text = rsub(text, "sh", "š")
	text = rsub(text, "x", "ks")

	--c, gü/gu+e or i, q
	-- The replacements below emit a plain ASCII "g" so that the result has an
	-- entry in `correspondence`; a script "ɡ" (U+0261) here would fall through
	-- convert_jawi() untransliterated.
	-- NOTE(review): "e", "o" and "ü" were already folded to "i"/"u" above, so
	-- the two "gü" rules cannot match as written and the "e"/"ë"/"o"
	-- alternatives in the classes below are never hit. Left in place pending a
	-- decision on the intended ordering.
	text = rsub(text, "c([iey])", "s%1")
	text = rsub(text, "(" .. V .. ")gü([ie])", "%1gw%2")
	text = rsub(text, "gü([ie])", "guw%1")
	text = rsub(text, "gui([aeëo])", "gy%1")
	text = rsub(text, "gu([ie])", "g%1")
	text = rsub(text, "qu([ie])", "k%1")
	-- Saltillo -> apostrophe. The text has already been lower-cased, so accept
	-- the lower-case saltillo (U+A78C) as well as the capital one (U+A78B).
	text = rsub(text, "[" .. u(0xA78B) .. u(0xA78C) .. "]", "'")
	text = rsub(text, "c", "k")
	text = rsub(text, "f", "p")
	text = rsub(text, "v", "b")

	-- Double consonants
	text = rsub(text, "dj", "jj")
	text = rsub(text, "nñ", "ññ")

	-- Correction for vowels with in-between glottal stop, now default
	text = rsub_repeatedly(text, "(" .. V .. "[" .. MACRON .. "]?)(" .. V .. ")", "%1'%2")

	-- Expand macron (long) vowels: ī -> iy, ū -> uw, any other -> doubled vowel
	text = rsub(text, "(i)" .. MACRON, "iy")
	text = rsub(text, "(u)" .. MACRON, "uw")
	text = rsub(text, "(".. V .. ")" .. MACRON, "%1%1")

	-- `words` is local so that it does not leak into the global environment.
	local words = rsplit(text, " ")
	for idx, word in ipairs(words) do
		-- State describing a detected reduplication:
		-- pre .. b1 .. inf .. b2 [-] b1 .. b2 .. suf
		local add_dup = false
		local pre, b1, inf, b2, suf = "", "", "", "", ""

		-- Callback shared by the three reduplication patterns below. It only
		-- records the captures; returning nothing leaves the matched text as is.
		local function record(m_pre, m_b1, m_inf, m_b2, m_suf)
			add_dup = true
			pre, b1, inf, b2, suf = m_pre, m_b1, m_inf, m_b2, m_suf
			-- Last character of base: when the base ends in a consonant and a
			-- suffix follows, that consonant is repeated in front of the suffix.
			local base_final = rmatch(m_b2, ".$")
			if not rfind(base_final, V) and m_suf ~= "" then
				suf = base_final .. m_suf
			end
		end

		word = "#" .. word .. "#"
		-- NOTE(review): every "x" was already rewritten to "ks" above, so this
		-- word-initial x -> s rule cannot match as written.
		word = rsub(word, "(#)x(" .. V .. ")", "%1s%2") -- x to s
		word = rsub(word, "([#-])(" .. V .. ")", "%1'%2") -- Add initial glottal stop
		word = rsub(word, "#", "")
		words[idx] = word

		-- 1) Base repeated in full, with or without a hyphen between the copies.
		rsub(word, "(.*)(.)(.+)-?%2%3(.*)$", function(m_pre, m_b1, m_b2, m_suf)
			record(m_pre, m_b1, "", m_b2, m_suf)
		end)
		-- 2) Retry allowing an infix inside the first copy, hyphenated form.
		if not add_dup then
			rsub(word, "(.*)(.)(.*)(.+)-%2%4(.*)$", record)
		end
		-- 3) Retry allowing an infix inside the first copy, without the hyphen.
		if not add_dup then
			rsub(word, "(.*)(.)(.*)(.+)%2%4(.*)$", record)
		end

		-- Reject matches that look like affixation rather than reduplication.
		-- The byte-wise :sub() calls are safe here because the comparison only
		-- matters when the base ends in a (single-byte) vowel.
		if (suf ~= "" and
			b2:sub(-1) ~= suf:sub(1,1) and
			(rfind(rmatch(b2, ".$"), V ) and suf:sub(1,1) ~= "h"))
			or inf == "ag" or inf == "al"
		then
			-- TO DO: ACTUALLY CHECK IF TAUSUG PREFIXES
			add_dup = false
		end

		if add_dup and not nordp then
			words[idx] = pre .. b1 .. inf .. b2 .. DUP .. suf
		end
	end
	text = table.concat(words, " ")
	return text
end
-- Transliterate preprocessed romanized text (see preprocess()) into Jawi.
--
-- The text is split into space-separated words and each word into
-- hyphen-separated components; components are converted character by
-- character and concatenated without the hyphen. Characters that are neither
-- a/i/u nor a key of `correspondence` (for example the DUP marker or
-- punctuation) are copied through unchanged.
--
-- @param text  preprocessed romanized string
-- @return the Jawi string, words separated by single spaces
local function convert_jawi(text)
	local result = ""
	-- Declared local so that nothing leaks into the global environment.
	local words = rsplit(text, " ")

	local function append(char)
		result = result .. char
	end

	-- True (a match position) when `char` is a Latin vowel; nil-safe.
	local function is_vowel(char)
		return rfind(char or "", V)
	end

	for wordct, word in ipairs(words) do
		local word_component = rsplit(word, "-")
		for comp_ct, comp_word in ipairs(word_component) do
			local i = 1
			-- Splitting on the empty pattern yields one entry per character.
			local chars = rsplit(comp_word, "")
			while chars[i] do
				local next_c = chars[i+1]
				local c = chars[i]
				local last_c = chars[i-1]
				-- Vowels
				if is_vowel(c) and vowels[c] then
					if last_c == "a" and c == last_c then
						-- "aa" (long a): second a is written as alip
						append(ALIP)
					elseif is_vowel(last_c) then
						-- vowel directly after another vowel: semivowel letter + patay
						append(semivowel[c])
						append(PT)
					else
						append(vowels[c])
					end
				-- Consonants
				else
					if(correspondence[c]) then
						if last_c and last_c == c and not is_vowel(last_c) and chars[i-2] ~= c then
							-- second of two identical consonants: mark gemination
							append(SB)
						else
							-- Process hamja
							if correspondence[c] == HAMJA then
								if i == 1 and is_vowel(next_c) then
									-- component-initial glottal stop before a vowel: bare alip
									append(ALIP)
								elseif(chars[i-2] ~= "a") then
									if next_c and (next_c == "i" or next_c == "u") then
										append(vowels_hamja[next_c])
									elseif last_c == DUP and is_vowel(next_c) then
										append(vowels_hamja[next_c])
									elseif is_vowel(last_c) then
										append(vowels_hamja[last_c])
									else
										append(correspondence[c])
									end
								elseif (not is_vowel(last_c) and last_c ~= "y" and last_c ~= "w" and last_c ~= DUP) and is_vowel(next_c) then
									append(vowels_hamja[next_c])
								elseif last_c == "y" and next_c and not is_vowel(next_c) and next_c ~= DUP then
									append(vowels_hamja["i"])
								elseif last_c == "w" and next_c and not is_vowel(next_c) and next_c ~= DUP then
									append(vowels_hamja["u"])
								else
									append(correspondence[c])
								end
							else
								append(correspondence[c])
							end
						end
						-- No vowel follows: add patay, except on the first of two
						-- identical consonants (the pair is written once with sabtu').
						if not is_vowel(next_c) and (next_c ~= c or last_c == c) then
							append(PT)
						end
					else
						append(c)
					end
				end
				i = i+1
			end
		end
		if wordct ~= #words then
			append(" ")
		end
	end
	-- alip (or hamja-on-alip), optional hataas, then alip  ->  alip madda
	result = rsub(result, "[" .. ALIP .. HAMJA_ON_ALIP .. "]" .. A .. "?" .. ALIP, AMAD)
	return result
end
-- Final clean-up of the converted text: canonicalize spaces, map Latin
-- punctuation through the table below (question mark, comma and semicolon
-- become their Arabic forms), and return the result in NFC.
local function postprocess(text)
	-- Convert punctuation --
	local punctuation = {
		["."] = ".",
		["?"] = "؟",
		[","] = "،",
		["!"] = "!",
		[":"] = ":",
		[";"] = "؛"
	}
	local spaced = canon_spaces(text)
	local punctuated = rsub(spaced, "[.,?!:;]", punctuation)
	local normalized = toNFC(punctuated)
	return normalized
end
-- Transcribe romanized text into Jawi by running the three stages in order:
-- preprocess -> convert_jawi -> postprocess.
-- `nordp` is passed to preprocess(); when truthy, reduplications are not
-- abbreviated with the DUP marker.
function export.transcribe(text, nordp)
	local normalized = preprocess(text, nordp)
	local jawi = convert_jawi(normalized)
	local cleaned = postprocess(jawi)
	return cleaned
end
-- Template entry point. Reads the parent frame's arguments: parameter 1 is
-- the text to transcribe (falling back to the current page's subpage text
-- when absent) and `nordp` is handed on to export.transcribe().
function export.show(frame)
	local parent_args = frame:getParent().args
	local args = require("Module:parameters").process(parent_args, {
		[1] = {},
		["nordp"] = {}
	})

	local source = args[1]
	if not source then
		source = mw.title.getCurrentTitle().subpageText
	end

	local transcribed = export.transcribe(source, args.nordp or false)
	return transcribed
end
return export