模組:Hani-sortkey
This module generates sortkeys for text written in the Han script (漢字). It is used to sort 錦語、南奄美大島語、中部白語、北部白語、南部白語、標敏語、碧約語、閩東語、晉語、官話、北部平話、洋涇浜英語、莆仙語、澳門皮欽葡萄牙語、南部平話、徽語、閩中語、東干語、達斡爾語、誒話、贛語、客家語、濊貊語、海南話、湘語、日語、八丈語、喜界語、拉瑪白語、中古漢語、雷州話、文言文、羯語、柔然語、吐谷渾語、拓跋語、烏桓語、鮮卑語、中古越南語、高欄語、閩北語、跨語言、宮古語、閩南語、大田話、泉漳話、海陸豐話、龍巖話、潮州話、浙南閩語、三鄉話、儂語、上古漢語、上古日語、沖永良部語、古回鶻語、布依語、百濟語、北奄美大島語、八重山語、沖繩語、Shaojiang Min、水語、白狼語、德之島語、阿勒楚喀語、巴拉語、恰喀拉語、岱依語、越南語、吳語、瓦鄉話、古典藏語、中古蒙古語、扶餘語、國頭語、與那國語、與論語、粵語、壯語、柔若語、漢語、韶州土話、四川話、台山話、高句麗語、扎話、契丹語和伽耶語.
The module should preferably not be called directly from templates or other modules.
To use it from a template, use {{sortkey}}.
Within a module, use Module:languages#Language:makeSortKey.
For testcases, see Module:Hani-sortkey/testcases.
Functions
makeSortKey(text, lang, sc)
- Generates a sortkey for a given piece of `text` written in the script specified by the code `sc`, and language specified by the code `lang`.
- When the sort fails, returns `nil`.
產生如下所示內容的示範函數位於Module:Hani-sortkey/templates中。模組的修改可以在Module:Hani-sortkey/sandbox中進行測試。單一字元的排序鍵是從178個資料模組之一檢索的。Module:Hani-sortkey/data為這些模組建立文件。
顯示排序鍵
- PS/2接口 (PS2手08口00)
- gas爐/gas炉 (gas火16)
- γ粒子 (γ米05子00)
- 命裡有時終須有,命裡無時莫強求 (口05衣07月02日06糸05頁03月02口05衣07火08日06艸07弓08水02)
- 得個……字 (彳08人08子03)
- 濕𣲷𣲷/湿𣲷𣲷 (水14水05水05)
- 赛车 (貝10車00)
- 鿪 (火13)
表意文字描述序列
- ⿰亻革 (⿰人00革00)
- ⿰亻革家語/⿰亻革家语 (⿰人00革00宀07言07)
- ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心麵/⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲长马长刂心面 (⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00⿺辵00⿳穴00⿲月00⿱⿲幺00言00幺00⿲長00馬00長00刀00心00麥09)
local export = {}
local m_str_utils = require("Module:string utilities")
local byte = string.byte
local codepoint = m_str_utils.codepoint
local concat = table.concat
local convert_iteration_marks = require("Module:Hani").convert_iteration_marks
local explode = m_str_utils.explode_utf8
local format = string.format
local gmatch = string.gmatch
local gsub = string.gsub
local insert = table.insert
local sub = string.sub
local u = m_str_utils.char
local ugsub = mw.ustring.gsub
local umatch = mw.ustring.match
local upper = m_str_utils.upper
local m_data = require("Module:Hani-sortkey/data/serialized")
local m_data_core = mw.loadData("Module:Hani-sortkey/data/core")
local cache = {}
--[[
Finds where an ideographic description sequence (IDS) ends.

`text` is an array of characters, `IDchar` is the ideographic description
character (IDC) located at index `i`, and `m_data_core.ids[IDchar]` gives the
number of components expected after that IDC. A component that is itself an
IDC starts a nested IDS, which is resolved recursively and counts as a single
component.

Returns the index of the last character of the IDS, or nil if any argument is
missing, `IDchar` is not a known IDC, or the text ends before every expected
component (including those of nested sequences) has been found.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end

	-- Number of components expected after current IDC.
	local components = m_data_core.ids[IDchar]
	if not components then
		-- Not an IDC, so no sequence starts here.
		return nil
	end

	local j = i
	for _ = 1, components do
		j = j + 1
		local char = text[j]
		if not char then
			-- Ran out of text before all components were found.
			return nil
		elseif m_data_core.ids[char] then
			-- Nested IDS: continue from its final character.
			j = findEndOfIDS(text, char, j)
			if not j then
				-- The nested IDS is incomplete, so this one is as well. Bail
				-- out here rather than doing arithmetic on nil on the next
				-- iteration.
				return nil
			end
		end
	end

	-- Every expected component was found.
	return j
end
-- Expands one serialized two-byte entry. The byte value of `a` is the key
-- into m_data_core.radicals; the byte value of `b`, less 10, is appended as a
-- zero-padded two-digit number.
local function unserialize(a, b)
	local radical = m_data_core.radicals[byte(a)]
	local number = format("%02d", byte(b) - 10)
	return radical .. number
end
--[==[
Returns the sortkey data for a single character, given either as a string or
as a codepoint number. Any other argument type raises an error.

The data is stored in [[Module:Hani-sortkey/data]]. It is not accessed
directly (due to the large amount of memory this would consume); instead it is
read from the serialized form in [[Module:Hani-sortkey/data/serialized]], which
holds two bytes per codepoint covered by m_data_core.ranges. If the data is
changed, the new serialized data can be generated with
[[Module:Hani-sortkey/data/serializer]].

A codepoint outside every range is returned unchanged as a character.
]==]
function export.getData(char)
	local kind = type(char)
	if kind == "string" then
		char = codepoint(char)
	elseif kind ~= "number" then
		error("getData must operate on a single character or codepoint.")
	end

	local ranges = m_data_core.ranges
	-- Number of codepoints in the ranges that end below `char`.
	local preceding = 0
	-- `ranges` is a flat list of (start, finish) pairs; `i` indexes the finish.
	for i = 2, ranges.n, 2 do
		local first, last = ranges[i - 1], ranges[i]
		if char > last then
			preceding = preceding + last - first + 1
		elseif char >= first then
			-- Byte position of the second byte of this codepoint's entry.
			local finish = 2 * (preceding + char - first + 1)
			-- Parentheses discard gsub's second return value.
			return (gsub(sub(m_data, finish - 1, finish), "(.)(.)", unserialize))
		end
	end

	return u(char)
end
--[==[
Builds the sortkey for `text`.

`sc` is a script code; when it is given and is not one of Hani, Hans, Hant,
Jpan or Kore, the uppercased text is returned as is. `lang` is accepted but
not used by this function.

Otherwise iteration marks are expanded, whitespace is removed, and punctuation
is removed unless the text consists solely of punctuation (so that entries
for punctuation characters can still be sorted properly). The remaining
characters are then converted one by one and the results concatenated.
]==]
function export.makeSortKey(text, lang, sc)
	local han_scripts = {
		Hani = true,
		Hans = true,
		Hant = true,
		Jpan = true,
		Kore = true
	}
	if sc and not han_scripts[sc] then
		return upper(text)
	end

	text = ugsub(convert_iteration_marks(text), "%s+", "")
	if not umatch(text, "^%p+$") then
		text = ugsub(text, "%p+", "")
	end

	local chars = explode(text)
	local parts = {}
	local len = #chars
	local pos = 1

	while pos <= len do
		local char = chars[pos]

		-- Some characters are replaced by one or more other characters before
		-- lookup; the replacement is spliced into the character list in place.
		local replacement = m_data_core.preconvert[char]
		if replacement then
			local offset = 0
			-- One UTF-8 character per match: a lead byte plus continuation bytes.
			for piece in gmatch(replacement, ".[\128-\191]*") do
				if offset == 0 then
					chars[pos] = piece
				else
					insert(chars, pos + offset, piece)
				end
				offset = offset + 1
			end
			char = chars[pos]
			len = #chars
		end

		if m_data_core.ids[char] then
			--[=[
			An ideographic description character (IDC) may begin a valid
			ideographic description sequence (IDS). If the IDS is valid and a
			sortkey for it is listed in m_data_core.unsupported, that sortkey
			is used and processing resumes after the IDS. Otherwise the IDC
			itself goes into the sortkey and processing resumes with the
			character after the IDC. Both failure cases are tracked.
			]=]
			local ids_end = findEndOfIDS(chars, char, pos)
			local IDS, data
			if ids_end then
				IDS = concat(chars, nil, pos, ids_end)
				data = m_data_core.unsupported[IDS]
			end

			if data then
				parts[#parts + 1] = data
				-- Skip the remainder of the IDS.
				pos = ids_end
			else
				if IDS then
					require("Module:debug").track("Hani-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: '"
						.. IDS .. "'")
				else
					require("Module:debug").track("Hani-sortkey/invalid-IDS")
					mw.log("invalid ideographic description sequence at the beginning of '"
						.. chars[pos] .. "'")
				end
				parts[#parts + 1] = char
			end
		else
			-- Per-character results are memoized in the module-level cache.
			local key = cache[char]
			if not key then
				key = export.getData(char)
				cache[char] = key
			end
			parts[#parts + 1] = key
		end

		pos = pos + 1
	end

	return concat(parts)
end
return export
- 漢字
- Sortkey-generating modules without a testcases subpage
- 各文字排序鍵生成模塊
- 漢字模塊
- 排序鍵生成模塊
- 洋涇浜英語模塊
- 與那國語模塊
- 達斡爾語模塊
- 南部平話模塊
- Shaojiang Min模塊
- 羯語模塊
- 大田話模塊
- 中部白語模塊
- 跨語言模塊
- 拓跋語模塊
- 錦語模塊
- 韶州土話模塊
- 壯語模塊
- 伽耶語模塊
- 四川話模塊
- 北部平話模塊
- 高句麗語模塊
- 喜界語模塊
- 官話模塊
- 與論語模塊
- 台山話模塊
- 北奄美大島語模塊
- 恰喀拉語模塊
- 三鄉話模塊
- 儂語模塊
- 文言文模塊
- 扎話模塊
- 贛語模塊
- 百濟語模塊
- 上古漢語模塊
- 烏桓語模塊
- 拉瑪白語模塊
- 客家語模塊
- 浙南閩語模塊
- 莆仙語模塊
- 濊貊語模塊
- 晉語模塊
- 澳門皮欽葡萄牙語模塊
- 沖永良部語模塊
- 漢語模塊
- 東干語模塊
- 閩北語模塊
- 日語模塊
- 國頭語模塊
- 雷州話模塊
- 白狼語模塊
- 八丈語模塊
- 閩南語模塊
- 宮古語模塊
- 沖繩語模塊
- 誒話模塊
- 吳語模塊
- 潮州話模塊
- 海南話模塊
- 古典藏語模塊
- 吐谷渾語模塊
- 鮮卑語模塊
- 巴拉語模塊
- 北部白語模塊
- 八重山語模塊
- 德之島語模塊
- 柔若語模塊
- 中古漢語模塊
- 閩中語模塊
- 龍巖話模塊
- 中古蒙古語模塊
- 粵語模塊
- 閩東語模塊
- 瓦鄉話模塊
- 南奄美大島語模塊
- 泉漳話模塊
- 湘語模塊
- 岱依語模塊
- 中古越南語模塊
- 阿勒楚喀語模塊
- 高欄語模塊
- 上古日語模塊
- 標敏語模塊
- 水語模塊
- 越南語模塊
- 古回鶻語模塊
- 契丹語模塊
- 海陸豐話模塊
- 碧約語模塊
- 扶餘語模塊
- 徽語模塊
- 布依語模塊
- 南部白語模塊
- 柔然語模塊