import { DomUtils, parseDocument } from "htmlparser2";
import { Element as DomElement } from "domhandler";
import type { AnyNode, Element, Text } from "domhandler";
import type { SemanticNode, SemanticNodeState, SemanticTreeOptions } from "./types";
/** Options forwarded to descendant-text collection. */
type DescendantTextOptions = {
  /**
   * Element ids supplied by the caller for exclusion (callers in this file pass
   * the ids referenced by description attributes). The exact handling lives in
   * the text-collection helper, which is not shown here.
   */
  excludeIds?: Set<string>;
};
/** Mutable state shared by every step of one static extraction pass. */
type StaticContext = {
  /** Fully resolved options: defaults merged with inferred tweaks and caller overrides. */
  options: Required<StaticSemanticTreeOptions>;
  /** Id counter, initialised to 1 per extraction (presumably consumed by the id allocator). */
  nextId: number;
  /** Elements indexed by their `id` attribute. */
  ids: Map<string, Element>;
  /** Ids mentioned by any ARIA relationship attribute in the document. */
  referencedIds: Set<string>;
  /** Ids listed in `aria-controls` of an element that has `aria-expanded="false"`. */
  collapsedControlledIds: Set<string>;
  /** `<label>` elements indexed by the value of their `for` attribute. */
  labelsByFor: Map<string, Element>;
  /** Slot name to light-DOM nodes of the shadow host currently being walked, if any. */
  slotAssignments: Map<string, AnyNode[]> | undefined;
};
/**
 * Baseline option values used when the caller does not override them.
 * Checked against the context's `options` shape via `satisfies`, so a missing
 * or misspelled key is a compile error while literal types are preserved.
 */
const defaultOptions = {
  // Output shape and filtering.
  includeAttributes: true,
  excludeLikelyAds: false,
  includeHidden: false,
  includeSelectOptions: true,
  includeTextNodes: false,
  maxTextLength: 240,
  mode: "compact",
  excludeLikelyBoilerplate: false,

  // Size limits.
  maxChildrenPerNode: 80,
  maxLinkFarmChildren: 24,
  maxRepeatedSubtreeInstances: 3,

  // Pruning and summarisation switches.
  pruneCollapsedSubtrees: true,
  pruneLikelyClosedOverlays: true,
  summarizeLargeSubtrees: true,
  summarizeLikelyLinkFarms: true,
  summarizeRepeatedSubtrees: true,
} satisfies StaticContext["options"];
/** Roles that this module treats as interactive (and, by default, focusable). */
const interactiveRoles = new Set([
  "button", "checkbox", "combobox", "link", "listbox",
  "menuitem", "menuitemcheckbox", "menuitemradio",
  "option", "radio", "searchbox", "slider", "spinbutton",
  "switch", "tab", "textbox", "treeitem",
]);
/** Implicit role assigned to sectioning/landmark tags, keyed by lower-cased tag name. */
const landmarkTags: Record<string, string> = {
  article: "article",
  aside: "complementary",
  footer: "contentinfo",
  header: "banner",
  main: "main",
  nav: "navigation",
  section: "region",
};
/** Roles whose accessible name may fall back to the element's own text content. */
const rolesNamedFromContents = new Set([
  "button", "cell", "checkbox", "columnheader",
  "heading", "link", "listitem",
  "menuitem", "menuitemcheckbox", "menuitemradio",
  "option", "radio", "rowheader",
  "switch", "tab", "treeitem",
]);
/**
 * Matches an inline `style` value containing a declaration that hides the
 * element: `display: none`, `visibility: hidden`, `content-visibility: hidden`
 * or a zero `opacity`. Whitespace before the terminator and an optional
 * `!important` flag are tolerated so `display: none !important` and
 * `display: none ;` are recognised too. No `g` flag, so `.test()` is stateless.
 */
const hiddenStylePattern = /(?:^|;)\s*(display\s*:\s*none|visibility\s*:\s*hidden|content-visibility\s*:\s*hidden|opacity\s*:\s*0(?:\.0+)?)\s*(?:!\s*important\s*)?(?:;|$)/i;
/** Tags skipped outright outside "full" mode (declarative shadow templates are exempted by the caller). */
const nonSemanticTags = new Set([
  "head",
  "link",
  "meta",
  "script",
  "style",
  "template",
]);
/** The subset of `SemanticTreeOptions` honoured by the static (string-based) extractor. */
export type StaticSemanticTreeOptions = Pick<
  SemanticTreeOptions,
  | "excludeLikelyAds"
  | "excludeLikelyBoilerplate"
  | "includeAttributes"
  | "includeHidden"
  | "includeSelectOptions"
  | "includeTextNodes"
  | "maxChildrenPerNode"
  | "maxLinkFarmChildren"
  | "maxRepeatedSubtreeInstances"
  | "maxTextLength"
  | "mode"
  | "pruneCollapsedSubtrees"
  | "pruneLikelyClosedOverlays"
  | "summarizeLargeSubtrees"
  | "summarizeLikelyLinkFarms"
  | "summarizeRepeatedSubtrees"
>;
/**
 * Builds a semantic tree from an HTML string without a live DOM.
 *
 * The markup is parsed with lower-cased tag/attribute names, indexed once
 * (ids, ARIA references, labels), then walked starting from `<body>`, falling
 * back to `<html>` and finally to a fragment root.
 *
 * @param html - Markup to analyse.
 * @param options - Caller overrides applied on top of the defaults.
 * @returns The root semantic node, or an "unavailable" placeholder node when
 *   the walk yields nothing.
 */
export function extractStaticSemanticTree(html: string, options: StaticSemanticTreeOptions = {}): SemanticNode {
  const parsed = parseDocument(html, {
    lowerCaseAttributeNames: true,
    lowerCaseTags: true,
    recognizeSelfClosing: true,
  });
  const topLevelNodes = parsed.children;

  const context: StaticContext = {
    options: resolveStaticOptions(topLevelNodes, html, options),
    nextId: 1,
    ids: new Map(),
    referencedIds: new Set(),
    collapsedControlledIds: new Set(),
    labelsByFor: new Map(),
    slotAssignments: undefined,
  };
  indexDocument(topLevelNodes, context);

  const root =
    findElement(topLevelNodes, "body")
    ?? findElement(topLevelNodes, "html")
    ?? fragmentRoot(topLevelNodes);
  const tree = walkElement(root, context);
  return tree ?? unavailableNode(context, "document", "HTML has no inspectable root");
}
export { extractStaticSemanticTree as extract };
/**
 * Resolves the effective options for one extraction.
 *
 * Starts from the defaults, relaxes or tightens limits based on the inferred
 * source profile (wiki-like or forum-like pages), then applies the caller's
 * overrides. Overrides whose value is `undefined` are ignored so that passing
 * `{ maxTextLength: undefined }` keeps the default instead of erasing it.
 *
 * @param nodes - Top-level parsed nodes, used for profile inference.
 * @param html - Raw markup, used for profile inference.
 * @param options - Caller overrides.
 * @returns A complete option set with no missing keys.
 */
function resolveStaticOptions(nodes: AnyNode[], html: string, options: StaticSemanticTreeOptions): StaticContext["options"] {
  const inferred = inferStaticSourceProfile(nodes, html);
  const resolved = { ...defaultOptions };
  if (inferred.wikiLike) {
    resolved.maxChildrenPerNode = 400;
    resolved.maxLinkFarmChildren = 80;
  }
  if (inferred.forumLike) {
    // Applied after the wiki tweak, so a page matching both keeps the forum limit.
    resolved.maxLinkFarmChildren = 19;
  }
  // Drop explicitly-undefined entries; spreading them would overwrite defaults.
  // The cast is sound: only keys are removed, the remaining entries are untouched.
  const overrides = Object.fromEntries(
    Object.entries(options).filter(([, value]) => value !== undefined),
  ) as StaticSemanticTreeOptions;
  return { ...resolved, ...overrides };
}
/**
 * Guesses what kind of site the markup comes from.
 *
 * Combines id/class of the root and body with a few `<meta>` values into one
 * lower-cased string, and also probes the raw HTML, looking for wiki and forum
 * markers (including Korean/Japanese forum vocabulary).
 *
 * @returns Flags indicating whether the page looks wiki-like and/or forum-like.
 */
function inferStaticSourceProfile(nodes: AnyNode[], html: string): { wikiLike: boolean; forumLike: boolean } {
  const root = findElement(nodes, "html") ?? fragmentRoot(nodes);
  const body = findElement(nodes, "body");

  const signals = [
    attr(root, "class"),
    attr(root, "id"),
    body ? attr(body, "class") : "",
    body ? attr(body, "id") : "",
    firstMetaContent(root, "generator"),
    firstMetaContent(root, "application-name"),
    firstMetaContent(root, "twitter:site"),
  ];
  const profileText = signals.filter(Boolean).join(" ").toLowerCase();

  const wikiLike =
    /\b(mediawiki|mw-parser-output|wikipedia|wikimedia)\b/.test(profileText)
    || /\b(?:id|class)=["'][^"']*\bmw-parser-output\b/i.test(html);

  const forumLike =
    /\b(5ch|2ch|dcinside|ruliweb|clien|bbs|board|forum|gallery|gall|thread|subback)\b/.test(profileText)
    || /\b(?:id|class)=["'][^"']*\b(?:gall_list|threadlist|thread-list|board-list|article-list|subback|bbs|forum)\b/i.test(html)
    || /(?:갤러리|게시판|댓글|개념글|스레드|レス|話題度)/.test(html);

  return { wikiLike, forumLike };
}
/**
 * Finds the first `<meta>` element (in document order) whose `name` or
 * `property` equals `name` and returns its `content`.
 *
 * Uses an explicit LIFO stack with children pushed in reverse, which visits
 * nodes in the same pre-order as before but avoids the repeated
 * `shift`/`unshift` re-indexing and never spreads a child list as call arguments.
 *
 * @param root - Element to search beneath; `undefined` yields `""`.
 * @param name - Value to match against the `name` or `property` attribute.
 * @returns The matching `content` value, or `""` when none is found or it is absent.
 */
function firstMetaContent(root: Element | undefined, name: string): string {
  /* v8 ignore next -- internal callers pass a fragment fallback when html is absent. */
  if (!root) return "";
  const pending = root.children.slice().reverse();
  for (let node = pending.pop(); node !== undefined; node = pending.pop()) {
    if (!isElement(node)) continue;
    if (node.name === "meta" && (attr(node, "name") === name || attr(node, "property") === name)) {
      return attr(node, "content") ?? "";
    }
    // Reverse so the first child ends up on top of the stack and is visited next.
    for (const child of node.children.slice().reverse()) pending.push(child);
  }
  return "";
}
/**
 * Recursively indexes every element beneath `nodes` into the context:
 * id lookup, ARIA-referenced ids, ids controlled by collapsed triggers, and
 * `<label for>` targets.
 *
 * When an id is duplicated, the first element in document order is kept,
 * matching how the DOM resolves id references (`getElementById` returns the
 * first match in tree order).
 *
 * @param nodes - Sibling nodes to index; non-elements are ignored.
 * @param context - Extraction context whose indexes are filled in place.
 */
function indexDocument(nodes: AnyNode[], context: StaticContext): void {
  for (const node of nodes) {
    if (!isElement(node)) continue;
    const id = attr(node, "id");
    if (id && !context.ids.has(id)) context.ids.set(id, node);
    for (const referencedId of referencedIds(node)) {
      context.referencedIds.add(referencedId);
    }
    if (attr(node, "aria-expanded") === "false") {
      // A collapsed trigger: remember what it controls so those targets can be pruned.
      for (const controlledId of (attr(node, "aria-controls") ?? "").split(/\s+/)) {
        if (controlledId) context.collapsedControlledIds.add(controlledId);
      }
    }
    if (node.name === "label") {
      const target = attr(node, "for");
      if (target) context.labelsByFor.set(target, node);
    }
    indexDocument(node.children, context);
  }
}
/**
 * Lists every id token mentioned by the element's ARIA relationship attributes.
 *
 * @returns Ids in attribute order, then token order; duplicates are kept.
 */
function referencedIds(element: Element): string[] {
  const relationshipAttributes = [
    "aria-labelledby",
    "aria-describedby",
    "aria-details",
    "aria-errormessage",
    "aria-controls",
    "aria-owns",
    "aria-flowto",
    "aria-activedescendant",
  ];
  const collected: string[] = [];
  for (const attributeName of relationshipAttributes) {
    const raw = attr(element, attributeName);
    if (!raw) continue;
    for (const piece of raw.split(/\s+/)) {
      const token = piece.trim();
      if (token) collected.push(token);
    }
  }
  return collected;
}
/**
 * Collects the ids referenced by the element's description-style attributes
 * (`aria-describedby`, `aria-details`, `aria-errormessage`).
 *
 * @returns The distinct id tokens found across those attributes.
 */
function descriptionReferenceIds(element: Element): Set<string> {
  return new Set([
    attr(element, "aria-describedby"),
    attr(element, "aria-details"),
    attr(element, "aria-errormessage"),
  ]
    .filter((value): value is string => Boolean(value))
    .flatMap((value) => value.split(/\s+/).map((item) => item.trim()).filter(Boolean)));
}
/**
 * Converts one parsed element (and its subtree) into a semantic node.
 *
 * Returns `null` when the element is dropped (skipped tag, hidden, likely ad or
 * boilerplate, collapsed/closed overlay, or pruned with no children). When a
 * wrapper is pruned, its single child is returned in its place, or a container
 * node when there are several children.
 *
 * @param element - Element to convert; `undefined` yields `null`.
 * @param context - Extraction context (options, indexes, id counter).
 */
function walkElement(element: Element | undefined, context: StaticContext): SemanticNode | null {
/* v8 ignore next -- public extraction always supplies body/html or a fragment root. */
if (!element) return null;
// Exclusion filters; each returns before any child is visited or id allocated.
if (shouldSkipElement(element, context)) return null;
if (!context.options.includeHidden && isHidden(element)) return null;
if (context.options.excludeLikelyAds && isLikelyAd(element)) return null;
// Boilerplate tables are flattened rather than dropped; this check precedes the generic one.
if (context.options.excludeLikelyBoilerplate && isLikelyBoilerplateTable(element)) return flattenBoilerplateTable(element, context);
if (context.options.excludeLikelyBoilerplate && isLikelyBoilerplate(element)) return null;
if (!context.options.includeHidden && isCollapsedControlledElement(element, context)) return null;
if (!context.options.includeHidden && isLikelyClosedOverlay(element, context)) return null;
const role = getRole(element);
const state = getState(element);
const focusable = isFocusable(element, role);
const interactive = isInteractive(element, role, focusable);
// A name is only computed for elements that have a role.
const name = role ? computeName(element, role, context) : "";
const tag = element.name;
// Children are built before this node's own id is allocated below.
const children = shouldSkipChildrenForCollapsedElement(element, context) ? [] : collectChildren(element, context);
// An iframe with `src` but no `srcdoc` and no children gets a placeholder child.
if (tag === "iframe" && children.length === 0 && attr(element, "src") && !attr(element, "srcdoc")) {
children.push(unavailableNode(context, "iframe", "iframe content unavailable in static HTML"));
}
// Interactive mode: non-interactive elements survive only as containers for their children.
if (context.options.mode === "interactive" && !interactive) {
return children.length > 0 ? containerNode(context, tag, children) : null;
}
// Wrapper pruning: replace the element by its only child, or by a container of its children.
if (shouldPruneCustomElementWrapper(element, role, name, interactive, children, context)) {
return children.length === 1 ? children[0] ?? null : containerNode(context, "fragment", children);
}
if (shouldPruneListItemWrapper(role, children, context)) {
return children.length === 1 ? children[0] ?? null : containerNode(context, tag, children);
}
if (shouldPrune(element, role, name, interactive, children, context)) {
if (children.length === 0) return null;
return children.length === 1 ? children[0] ?? null : containerNode(context, tag, children);
}
const node: SemanticNode = {
id: nextId(context),
tag,
role,
name,
interactive,
focusable,
selector: getSelector(element),
xpath: getXPath(element),
children,
};
// Optional fields are attached only when they carry a value.
const description = computeDescription(element, context);
if (description) node.description = description;
const text = directText(element, context.options.maxTextLength);
if (text) node.text = text;
const value = getValue(element);
if (value) node.value = value;
if (Object.keys(state).length > 0) node.state = state;
// Shallow copy so the semantic node does not alias the parser's attribute object.
if (context.options.includeAttributes) node.attributes = { ...element.attribs };
return node;
}
/**
 * Builds the semantic children of `element`.
 *
 * Three cases are handled, in order:
 * 1. the element hosts a declarative shadow template: the template's element
 *    children are walked with the host's light-DOM slot assignments active;
 * 2. the element is a `<slot>` inside such a template: assigned nodes (or the
 *    slot's own element children as fallback) are projected;
 * 3. otherwise: regular element children, plus text nodes when
 *    `includeTextNodes` is set.
 *
 * Children dropped by repetition, size or link-farm summarisation are counted
 * and reported through one trailing "omitted" node.
 */
function collectChildren(element: Element, context: StaticContext): SemanticNode[] {
const children: SemanticNode[] = [];
// Per-parent counter of child signatures, used for repeated-subtree summarisation.
const repeatedSignatures = new Map();
let omitted = 0;
const shadowTemplate = element.children.find((child): child is Element => isElement(child) && isDeclarativeShadowTemplate(child));
if (shadowTemplate) {
// Swap in this host's slot assignments for the duration of the template walk, then restore.
const previousAssignments = context.slotAssignments;
context.slotAssignments = collectSlotAssignments(element);
for (const child of shadowTemplate.children) {
if (!isElement(child)) continue;
const semanticChild = walkElement(child, context);
omitted += appendSemanticChild(element, semanticChild, children, repeatedSignatures, context);
}
context.slotAssignments = previousAssignments;
const linkFarmSummary = summarizeLikelyLinkFarmChildren(element, children, context);
if (linkFarmSummary.omitted > 0) {
// Replace the list in place so the same array instance is returned.
children.splice(0, children.length, ...linkFarmSummary.children);
omitted += linkFarmSummary.omitted;
}
if (omitted > 0) children.push(omittedNode(context, omitted));
return children;
}
if (element.name === "slot" && context.slotAssignments) {
// An unnamed slot looks up the "" key, the same key used for nodes without a `slot` attribute.
const slotName = attr(element, "name") ?? "";
const assignedChildren = context.slotAssignments.get(slotName) ?? [];
// Fallback content is limited to the slot's element children.
const projectedChildren = assignedChildren.length > 0 ? assignedChildren : element.children.filter(isElement);
for (const child of projectedChildren) {
if (isElement(child)) {
const semanticChild = walkElement(child, context);
omitted += appendSemanticChild(element, semanticChild, children, repeatedSignatures, context);
} else if (context.options.includeTextNodes && isText(child)) {
const text = normalizeText(child.data, context.options.maxTextLength);
if (text) {
// Projected text is appended directly, without the size check used in the regular branch.
children.push({
id: nextId(context),
tag: "#text",
role: "text",
name: text,
text,
interactive: false,
focusable: false,
children: [],
});
}
}
}
if (omitted > 0) children.push(omittedNode(context, omitted));
return children;
}
for (const child of element.children) {
if (isElement(child)) {
// With includeSelectOptions off, element children of <select> are not walked.
if (!context.options.includeSelectOptions && element.name === "select") continue;
const semanticChild = walkElement(child, context);
omitted += appendSemanticChild(element, semanticChild, children, repeatedSignatures, context);
} else if (context.options.includeTextNodes && isText(child)) {
const text = normalizeText(child.data, context.options.maxTextLength);
if (text) {
// The id is allocated before the size check, so an omitted text node still consumes one.
const textNode: SemanticNode = {
id: nextId(context),
tag: "#text",
role: "text",
name: text,
text,
interactive: false,
focusable: false,
children: [],
};
if (shouldSummarizeMoreChildren(element, children, context)) {
omitted += 1;
} else {
children.push(textNode);
}
}
}
}
const linkFarmSummary = summarizeLikelyLinkFarmChildren(element, children, context);
if (linkFarmSummary.omitted > 0) {
children.splice(0, children.length, ...linkFarmSummary.children);
omitted += linkFarmSummary.omitted;
}
if (omitted > 0) children.push(omittedNode(context, omitted));
return children;
}
/**
 * Groups a shadow host's light-DOM children by the slot they target.
 *
 * Elements are keyed by their `slot` attribute (`""` when absent); text nodes
 * always go to the default slot `""`. Whitespace-only text, non-element nodes
 * and the declarative shadow template itself are ignored.
 *
 * @param host - The shadow host element.
 * @returns Slot name mapped to assigned nodes, in document order.
 */
function collectSlotAssignments(host: Element): Map<string, AnyNode[]> {
  const assignments = new Map<string, AnyNode[]>();
  for (const child of host.children) {
    if (!isUsefulSlotAssignment(child)) continue;
    const slotName = isElement(child) ? attr(child, "slot") ?? "" : "";
    const assigned = assignments.get(slotName) ?? [];
    assigned.push(child);
    assignments.set(slotName, assigned);
  }
  return assignments;
}
/**
 * Tells whether a light-DOM node is worth projecting into a slot: text with
 * non-empty normalized content, or any element other than the declarative
 * shadow template.
 */
function isUsefulSlotAssignment(node: AnyNode): boolean {
  if (isText(node)) {
    const content = normalizeText(node.data, 120);
    return content !== "";
  }
  return isElement(node) && !isDeclarativeShadowTemplate(node);
}
/**
 * Appends `child` to `children` unless a summarisation rule drops it.
 *
 * The repeated-subtree check runs first and always records the child's
 * signature, even when the size limit would also have dropped the child.
 *
 * @param parent - Element whose children are being collected.
 * @param child - Candidate node; `null` is ignored.
 * @param children - Accumulator, mutated in place.
 * @param repeatedSignatures - Per-parent count of child signatures seen so far.
 * @param context - Extraction context.
 * @returns The number of semantic nodes omitted (0 when the child was kept or was `null`).
 */
function appendSemanticChild(
  parent: Element,
  child: SemanticNode | null,
  children: SemanticNode[],
  repeatedSignatures: Map<string, number>,
  context: StaticContext,
): number {
  if (!child) return 0;
  if (shouldSummarizeRepeatedChild(parent, child, repeatedSignatures, context)) {
    return countSemanticNodes(child);
  }
  if (shouldSummarizeMoreChildren(parent, children, context)) {
    return countSemanticNodes(child);
  }
  children.push(child);
  return 0;
}
/**
 * Decides whether an element is ignored entirely. Nothing is skipped in "full"
 * mode, declarative shadow templates are always kept, and otherwise
 * non-semantic tags and `<noscript>` are skipped.
 */
function shouldSkipElement(element: Element, context: StaticContext): boolean {
  const keepEverything = context.options.mode === "full";
  if (keepEverything || isDeclarativeShadowTemplate(element)) return false;
  const tag = element.name;
  return nonSemanticTags.has(tag) || tag === "noscript";
}
/**
 * True for `<template>` elements whose `shadowrootmode` (or, when that is
 * absent, `shadowroot`) attribute is `open` or `closed`.
 */
function isDeclarativeShadowTemplate(element: Element): boolean {
  if (element.name !== "template") return false;
  const mode = attr(element, "shadowrootmode") ?? attr(element, "shadowroot");
  switch (mode) {
    case "open":
    case "closed":
      return true;
    default:
      return false;
  }
}
/**
 * True once a large-subtree candidate already holds `maxChildrenPerNode`
 * children, meaning further children should be counted as omitted. Disabled in
 * "full" mode or when `summarizeLargeSubtrees` is off.
 */
function shouldSummarizeMoreChildren(element: Element, children: SemanticNode[], context: StaticContext): boolean {
  const { mode, summarizeLargeSubtrees, maxChildrenPerNode } = context.options;
  if (mode === "full" || !summarizeLargeSubtrees) return false;
  return isLargeSubtreeCandidate(element) && children.length >= maxChildrenPerNode;
}
/** True for container tags whose child lists may be capped by large-subtree summarisation. */
function isLargeSubtreeCandidate(element: Element): boolean {
  switch (element.name) {
    case "nav":
    case "ul":
    case "ol":
    case "div":
    case "section":
    case "footer":
    case "header":
    case "main":
      return true;
    default:
      return false;
  }
}
/**
 * Trims link-heavy child lists ("link farms").
 *
 * Applies only when the feature is on, the mode is not "full", the list is
 * longer than `maxLinkFarmChildren`, the container looks like a link farm, at
 * least max(8, 65%) of children are link-ish and at most max(2, 20%) are
 * content-rich. Non-link-ish children are always kept; link-ish children are
 * kept up to `maxLinkFarmChildren` and the rest are counted as omitted.
 *
 * @returns The (possibly trimmed) children and the number of semantic nodes omitted.
 */
function summarizeLikelyLinkFarmChildren(
  element: Element,
  children: SemanticNode[],
  context: StaticContext,
): { children: SemanticNode[]; omitted: number } {
  const untouched = { children, omitted: 0 };
  const { mode, summarizeLikelyLinkFarms, maxLinkFarmChildren } = context.options;
  if (mode === "full" || !summarizeLikelyLinkFarms) return untouched;
  if (children.length <= maxLinkFarmChildren) return untouched;
  if (!isLikelyLinkFarmContainer(element)) return untouched;

  const { linkishChildren, contentRichChildren } = childLinkFarmStats(children);
  const minimumLinkish = Math.max(8, Math.floor(children.length * 0.65));
  if (linkishChildren < minimumLinkish) return untouched;
  const maximumContentRich = Math.max(2, Math.floor(children.length * 0.2));
  if (contentRichChildren > maximumContentRich) return untouched;

  const kept: SemanticNode[] = [];
  let omitted = 0;
  let keptLinkish = 0;
  for (const child of children) {
    /* v8 ignore next 4 -- covered by link-farm tests, but V8 maps this continue branch unreliably through TS output. */
    if (!isLinkishSummaryChild(child)) {
      kept.push(child);
      continue;
    }
    if (keptLinkish >= maxLinkFarmChildren) {
      omitted += countSemanticNodes(child);
      continue;
    }
    kept.push(child);
    keptLinkish += 1;
  }
  return omitted > 0 ? { children: kept, omitted } : untouched;
}
/**
 * Heuristic: can this element plausibly hold a link farm?
 *
 * Navigation-style tags always qualify. `div`/`section` qualify only when their
 * id/class/role/aria-label/title contain a list-like keyword and no
 * content-like keyword. Every other tag is rejected.
 */
function isLikelyLinkFarmContainer(element: Element): boolean {
  switch (element.name) {
    case "nav":
    case "ul":
    case "ol":
    case "aside":
    case "footer":
    case "header":
      return true;
    case "div":
    case "section":
      break;
    default:
      return false;
  }

  const hints: string[] = [];
  for (const attributeName of ["id", "class", "role", "aria-label", "title"]) {
    const raw = attr(element, attributeName);
    if (raw) hints.push(raw);
  }
  const value = hints.join(" ").toLowerCase();

  // Content-like keywords veto the match before list-like keywords are considered.
  if (/\b(article|body|content|contents|entry|main|post|story|text|view)\b/.test(value)) return false;
  if (/\b(board|category|comment|footer|gallery|gnb|header|issue|list|menu|nav|popular|recent|recommend|related|reply|sidebar|tab)\b/.test(value)) return true;
  return /갤러리|댓글|개념글|관련|목록|베스트|인기|최근|추천|카테고리/.test(value);
}
/** Counts how many direct children are link-ish and how many are content-rich. */
function childLinkFarmStats(children: SemanticNode[]): { linkishChildren: number; contentRichChildren: number } {
  const linkishChildren = children.filter((child) => isLinkishSummaryChild(child)).length;
  const contentRichChildren = children.filter((child) => isContentRichSummaryChild(child)).length;
  return { linkishChildren, contentRichChildren };
}
/**
 * A subtree is "link-ish" when it contains at least one link, no form controls,
 * no table parts, no content containers and at most one paragraph/text node.
 */
function isLinkishSummaryChild(node: SemanticNode): boolean {
  const { links, formControls, tables, paragraphs, contentContainers } = semanticRoleStats(node);
  if (links <= 0) return false;
  if (formControls !== 0 || tables !== 0) return false;
  return paragraphs <= 1 && contentContainers === 0;
}
/**
 * A subtree is "content-rich" when it has more than one paragraph/text node, or
 * any table part, content container or form control.
 */
function isContentRichSummaryChild(node: SemanticNode): boolean {
  const { paragraphs, tables, contentContainers, formControls } = semanticRoleStats(node);
  if (paragraphs > 1) return true;
  return tables > 0 || contentContainers > 0 || formControls > 0;
}
/**
 * Tallies role categories across a semantic subtree (the node itself included).
 * A node without a role is classified by its tag name instead.
 */
function semanticRoleStats(node: SemanticNode): {
  links: number;
  paragraphs: number;
  tables: number;
  formControls: number;
  contentContainers: number;
} {
  const kind = node.role ?? node.tag;
  const isOneOf = (...candidates: string[]): boolean => candidates.includes(kind);

  const totals = {
    links: isOneOf("link") ? 1 : 0,
    paragraphs: isOneOf("p", "text") ? 1 : 0,
    tables: isOneOf("table", "row", "cell") ? 1 : 0,
    formControls: isOneOf("textbox", "searchbox", "combobox", "listbox", "checkbox", "radio", "slider", "spinbutton", "switch") ? 1 : 0,
    contentContainers: isOneOf("article", "main") ? 1 : 0,
  };

  for (const child of node.children) {
    const nested = semanticRoleStats(child);
    totals.links += nested.links;
    totals.paragraphs += nested.paragraphs;
    totals.tables += nested.tables;
    totals.formControls += nested.formControls;
    totals.contentContainers += nested.contentContainers;
  }
  return totals;
}
/**
 * Records the child's structural signature and reports whether it has already
 * been seen `maxRepeatedSubtreeInstances` times under this parent.
 *
 * The signature count is incremented on every call that reaches it, including
 * calls that return `true`. Disabled in "full" mode, when
 * `summarizeRepeatedSubtrees` is off, or for non-candidate parents.
 *
 * @param parent - Element whose children are being collected.
 * @param child - Candidate child node.
 * @param signatures - Per-parent signature counts, mutated in place.
 * @param context - Extraction context.
 * @returns `true` when the child should be omitted as a repeat.
 */
function shouldSummarizeRepeatedChild(
  parent: Element,
  child: SemanticNode,
  signatures: Map<string, number>,
  context: StaticContext,
): boolean {
  if (!context.options.summarizeRepeatedSubtrees || context.options.mode === "full") return false;
  if (!isRepeatedSubtreeCandidate(parent)) return false;
  const signature = semanticSignature(child);
  const count = signatures.get(signature) ?? 0;
  signatures.set(signature, count + 1);
  return count >= context.options.maxRepeatedSubtreeInstances;
}
/** True for container tags whose identical children may be collapsed as repeats. */
function isRepeatedSubtreeCandidate(element: Element): boolean {
  switch (element.name) {
    case "body":
    case "main":
    case "nav":
    case "ul":
    case "ol":
    case "div":
    case "section":
    case "footer":
    case "header":
    case "aside":
      return true;
    default:
      return false;
  }
}
/**
 * Serialises a semantic subtree into a string made of tag, role, name, text,
 * value and an interactivity marker, followed by the bracketed,
 * comma-separated signatures of its children.
 */
function semanticSignature(node: SemanticNode): string {
  const nested = node.children.map((child) => semanticSignature(child)).join(",");
  const fields = [
    String(node.tag),
    String(node.role ?? ""),
    String(node.name),
    String(node.text ?? ""),
    String(node.value ?? ""),
    node.interactive ? "i" : "",
  ];
  return `${fields.join("|")}[${nested}]`;
}
/** Counts the nodes in a semantic subtree, the root included. */
function countSemanticNodes(node: SemanticNode): number {
  return node.children.reduce((total, child) => total + countSemanticNodes(child), 1);
}
/**
 * True when the element is collapsed and its children should not be walked:
 * `aria-expanded="false"`, or a `<details>`, `<dialog>` or `popover` element
 * without an `open` attribute. Disabled when `includeHidden` is set or
 * `pruneCollapsedSubtrees` is off.
 */
function shouldSkipChildrenForCollapsedElement(element: Element, context: StaticContext): boolean {
  const { includeHidden, pruneCollapsedSubtrees } = context.options;
  if (includeHidden || !pruneCollapsedSubtrees) return false;
  if (attr(element, "aria-expanded") === "false") return true;

  const lacksOpen = attr(element, "open") === null;
  const tag = element.name;
  if ((tag === "details" || tag === "dialog") && lacksOpen) return true;
  return attr(element, "popover") !== null && lacksOpen;
}
/**
 * True when collapsed-subtree pruning is on and the element's id was recorded
 * as controlled by a trigger with `aria-expanded="false"`.
 */
function isCollapsedControlledElement(element: Element, context: StaticContext): boolean {
  if (!context.options.pruneCollapsedSubtrees) return false;
  const id = attr(element, "id");
  return id ? context.collapsedControlledIds.has(id) : false;
}
/**
 * Heuristic for overlays (drawers, modals, menus, ...) that are present in the
 * markup but closed. The element must look like an overlay, show no "open"
 * signal and no non-negative tabindex, and carry at least one closed signal:
 * off-screen/zero-size style, a closed-state class, or `inert`/`aria-hidden`.
 * Disabled in "full" mode or when `pruneLikelyClosedOverlays` is off.
 */
function isLikelyClosedOverlay(element: Element, context: StaticContext): boolean {
  const { mode, pruneLikelyClosedOverlays } = context.options;
  if (mode === "full" || !pruneLikelyClosedOverlays) return false;

  const exempt =
    hasUsefulOpenSignal(element)
    || !hasOverlaySignal(element)
    || hasDirectFocusableIntent(element);
  if (exempt) return false;

  const closedSignals = [hasOffscreenOrClosedStyle, hasClosedClassSignal, hasInertSignal];
  return closedSignals.some((check) => check(element));
}
/**
 * True when the element advertises itself as open: an `open` attribute,
 * `aria-expanded="true"`, `aria-modal="true"`, `data-open="true"` or
 * `data-state="open"`.
 */
function hasUsefulOpenSignal(element: Element): boolean {
  if (attr(element, "open") !== null) return true;
  const expectations: Array<[string, string]> = [
    ["aria-expanded", "true"],
    ["aria-modal", "true"],
    ["data-open", "true"],
    ["data-state", "open"],
  ];
  return expectations.some(([attributeName, expected]) => attr(element, attributeName) === expected);
}
/**
 * True when the tag name, id, class, role or aria-label contains an
 * overlay-style keyword as a whole word (drawer, modal, menu, ...).
 */
function hasOverlaySignal(element: Element): boolean {
  const hints: string[] = [];
  if (element.name) hints.push(element.name);
  for (const attributeName of ["id", "class", "role", "aria-label"]) {
    const raw = attr(element, attributeName);
    if (raw) hints.push(raw);
  }
  const value = hints.join(" ").toLowerCase();
  return /\b(drawer|modal|dialog|popover|overlay|hamburger|menu|sidebar|sheet|flyout|dropdown)\b/.test(value);
}
/**
 * True when the element carries a `tabindex` that parses to a non-negative
 * integer, i.e. the author explicitly made it focusable.
 *
 * The value is parsed as an integer (leading whitespace and sign allowed,
 * trailing characters ignored) rather than with `Number()`, so an empty or
 * non-numeric `tabindex` no longer counts as `0`.
 */
function hasDirectFocusableIntent(element: Element): boolean {
  const tabindex = attr(element, "tabindex");
  if (tabindex === null) return false;
  // parseInt yields NaN for "" or garbage, and NaN >= 0 is false.
  return Number.parseInt(tabindex, 10) >= 0;
}
/** True when the element has an `inert` attribute or `aria-hidden="true"`. */
function hasInertSignal(element: Element): boolean {
  if (attr(element, "inert") !== null) return true;
  return attr(element, "aria-hidden") === "true";
}
/** True when the class attribute contains a closed-state keyword (case-insensitive). */
function hasClosedClassSignal(element: Element): boolean {
  const classNames = attr(element, "class");
  return /\b(closed|collapsed|hidden|inactive|is-closed|is-hidden)\b/i.test(classNames ?? "");
}
/**
 * Detects inline styles that park an element off-screen or collapse it:
 * a negative offset of at least two digits, a negative percentage translate,
 * a zero (max-)height, or `pointer-events: none`.
 *
 * The zero-height check requires the value to end right after the zero
 * (optionally followed by a unit and `!important`), so fractional sizes such
 * as `height: 0.5rem` are not mistaken for a collapsed element.
 */
function hasOffscreenOrClosedStyle(element: Element): boolean {
  const style = attr(element, "style") ?? "";
  if (!style) return false;
  // Whitespace is stripped first, so the patterns below never need `\s*`.
  const normalized = style.replace(/\s+/g, "").toLowerCase();
  return /(?:^|;)(?:left|right|top|bottom):-\d{2,}(?:px|rem|em|vw|vh|%)/.test(normalized)
    || /(?:^|;)transform:translate[xy]?\(-[1-9]\d*%/.test(normalized)
    || /(?:^|;)(?:max-height|height):0(?:\.0+)?(?:[a-z]+|%)?(?=!|;|$)/.test(normalized)
    || /(?:^|;)pointer-events:none/.test(normalized);
}
/**
 * Decides whether a wrapper element adds nothing and can be replaced by its
 * children.
 *
 * Never prunes in "full" mode. Presentational roles are always pruned.
 * Interactive, meaningfully-roled, named or id-referenced elements are kept.
 * Of the rest, childless elements are pruned, and elements with children are
 * pruned unless they carry an id, `aria-label` or `aria-labelledby`.
 */
function shouldPrune(
  element: Element,
  role: string | null,
  name: string,
  interactive: boolean,
  children: SemanticNode[],
  context: StaticContext,
): boolean {
  if (context.options.mode === "full") return false;
  if (role === "none" || role === "presentation") return true;

  const carriesSemantics = interactive || (Boolean(role) && role !== "generic") || Boolean(name);
  if (carriesSemantics || isReferencedIdTarget(element, context)) return false;
  if (children.length === 0) return true;

  const hasIdentity = ["id", "aria-label", "aria-labelledby"].some((attributeName) => Boolean(attr(element, attributeName)));
  return !hasIdentity;
}
/** True when the element's id is referenced by an ARIA relationship attribute somewhere in the document. */
function isReferencedIdTarget(element: Element, context: StaticContext): boolean {
  const id = attr(element, "id");
  if (!id) return false;
  return context.referencedIds.has(id);
}
/**
 * True for list items (outside "full" mode) that directly contain a link or
 * button, in which case the list item wrapper is dropped in favour of its children.
 */
function shouldPruneListItemWrapper(role: string | null, children: SemanticNode[], context: StaticContext): boolean {
  const applies = context.options.mode !== "full" && role === "listitem";
  if (!applies) return false;
  for (const child of children) {
    if (child.role === "link" || child.role === "button") return true;
  }
  return false;
}
/**
 * True for custom elements that merely wrap other content: not interactive, no
 * meaningful role, no name, at least one child and no useful host attribute.
 * Never applies in "full" mode.
 */
function shouldPruneCustomElementWrapper(
  element: Element,
  role: string | null,
  name: string,
  interactive: boolean,
  children: SemanticNode[],
  context: StaticContext,
): boolean {
  if (context.options.mode === "full" || !isCustomElement(element)) return false;
  const hasOwnSemantics = interactive || (Boolean(role) && role !== "generic") || Boolean(name);
  if (hasOwnSemantics || children.length === 0) return false;
  return !hasUsefulHostSignal(element);
}
/** True when the tag name contains a hyphen, the marker of a custom element name. */
function isCustomElement(element: Element): boolean {
  return element.name.indexOf("-") !== -1;
}
/**
 * True when a custom-element host carries a non-empty identifying or stateful
 * attribute, which makes the host worth keeping.
 */
function hasUsefulHostSignal(element: Element): boolean {
  const hostAttributes = [
    "id",
    "aria-label",
    "aria-labelledby",
    "aria-describedby",
    "aria-controls",
    "aria-expanded",
    "aria-selected",
    "aria-current",
    "tabindex",
  ];
  return hostAttributes.some((attributeName) => Boolean(attr(element, attributeName)));
}
/**
 * Determines an element's role: the first token of an explicit `role`
 * attribute, otherwise the implicit role implied by its tag.
 *
 * `<section>` and `<form>` only get a role when they have an explicit name
 * source. Links need an `href`. Hidden inputs have no role.
 *
 * The landmark lookup checks own properties only, so tag names that collide
 * with `Object.prototype` members (e.g. `<constructor>`) do not pick up an
 * inherited value as their role.
 *
 * @returns The role string, or `null` when the element has none.
 */
function getRole(element: Element): string | null {
  const explicit = firstToken(attr(element, "role"));
  if (explicit) return explicit;
  const tag = element.name;
  // Unnamed sections and forms are not landmarks; bail out before the tag lookups.
  if (tag === "section" && !hasExplicitNameSource(element)) return null;
  if (tag === "form" && !hasExplicitNameSource(element)) return null;
  if (Object.prototype.hasOwnProperty.call(landmarkTags, tag)) return landmarkTags[tag] ?? null;
  if (/^h[1-6]$/.test(tag)) return "heading";
  if (tag === "a" || tag === "area") return attr(element, "href") ? "link" : null;
  if (tag === "button") return "button";
  if (tag === "details" || tag === "fieldset") return "group";
  if (tag === "dialog") return "dialog";
  if (tag === "figure") return "figure";
  if (tag === "form") return "form";
  if (tag === "iframe") return "iframe";
  if (tag === "img") return "img";
  if (tag === "input") return inputRole(element);
  if (tag === "li") return "listitem";
  if (tag === "ol" || tag === "ul") return "list";
  if (tag === "option") return "option";
  if (tag === "p") return "p";
  if (tag === "progress") return "progressbar";
  if (tag === "select") return attr(element, "multiple") !== null ? "listbox" : "combobox";
  if (tag === "summary") return "button";
  if (tag === "table") return "table";
  if (tag === "td") return "cell";
  if (tag === "textarea") return "textbox";
  if (tag === "th") return attr(element, "scope") === "row" ? "rowheader" : "columnheader";
  if (tag === "tr") return "row";
  return null;
}
/**
 * Maps an `<input>`'s `type` (case-insensitive, defaulting to "text") to a
 * role. Hidden inputs have no role; unrecognised types are treated as text boxes.
 */
function inputRole(element: Element): string | null {
  const type = (attr(element, "type") ?? "text").toLowerCase();
  switch (type) {
    case "hidden":
      return null;
    case "button":
    case "submit":
    case "reset":
    case "image":
      return "button";
    case "checkbox":
      return "checkbox";
    case "radio":
      return "radio";
    case "range":
      return "slider";
    case "search":
      return "searchbox";
    case "number":
      return "spinbutton";
    default:
      return "textbox";
  }
}
/**
 * Computes an element's accessible name, trying sources in priority order:
 * `aria-labelledby` targets, `aria-label`, an associated `<label>`, the value
 * of button-like inputs, `alt` (images only), the element's own text (for
 * roles named from contents, excluding description targets), and finally
 * `title`. Every candidate is normalized and clipped to `maxTextLength`.
 *
 * @returns The first non-empty candidate, or `""`.
 */
function computeName(element: Element, role: string, context: StaticContext): string {
  const limit = context.options.maxTextLength;
  const clip = (value: string): string => normalizeText(value, limit);

  const labelledBy = attr(element, "aria-labelledby");
  if (labelledBy) {
    const parts: string[] = [];
    for (const id of labelledBy.split(/\s+/)) {
      const target = context.ids.get(id);
      if (target) parts.push(descendantText(target, context));
    }
    const fromReferences = clip(parts.join(" "));
    if (fromReferences) return fromReferences;
  }

  const fromAriaLabel = clip(attr(element, "aria-label") ?? "");
  if (fromAriaLabel) return fromAriaLabel;

  const fromLabel = labelName(element, context);
  if (fromLabel) return fromLabel;

  const fromValue = elementValueName(element);
  if (fromValue) return clip(fromValue);

  if (role === "img") {
    const fromAlt = clip(attr(element, "alt") ?? "");
    if (fromAlt) return fromAlt;
  }

  if (rolesNamedFromContents.has(role)) {
    const ownText = descendantText(element, context, { excludeIds: descriptionReferenceIds(element) });
    const fromContents = clip(ownText);
    if (fromContents) return fromContents;
  }

  const fromTitle = clip(attr(element, "title") ?? "");
  return fromTitle ? fromTitle : "";
}
/**
 * Derives a name from an associated `<label>`: first the label whose `for`
 * matches the element's id, then the closest ancestor `<label>`.
 *
 * @returns The normalized label text, or `""` when there is none.
 */
function labelName(element: Element, context: StaticContext): string {
  const limit = context.options.maxTextLength;
  const id = attr(element, "id");
  const explicitLabel = id ? context.labelsByFor.get(id) : undefined;
  if (explicitLabel) {
    const explicitText = normalizeText(descendantText(explicitLabel, context), limit);
    if (explicitText) return explicitText;
  }
  const wrappingLabel = findClosestLabel(element);
  if (!wrappingLabel) return "";
  return normalizeText(descendantText(wrappingLabel, context), limit);
}
/** Returns the nearest ancestor `<label>` element, or `null` when there is none. */
function findClosestLabel(element: Element): Element | null {
  for (let ancestor = element.parent; ancestor; ancestor = ancestor.parent) {
    if (isElement(ancestor) && ancestor.name === "label") return ancestor;
  }
  return null;
}
/**
 * Returns the `value` attribute of button-like inputs (`button`, `submit`,
 * `reset`), which serves as their visible label; `""` for everything else.
 */
function elementValueName(element: Element): string {
  if (element.name !== "input") return "";
  const type = (attr(element, "type") ?? "text").toLowerCase();
  const isButtonLike = type === "button" || type === "submit" || type === "reset";
  if (!isButtonLike) return "";
  return attr(element, "value") ?? "";
}
/**
 * Reads the element's state from native and ARIA attributes.
 *
 * Only states that are actually expressed are set: boolean ARIA attributes are
 * recorded for `"true"`/`"false"` (tri-state ones also for `"mixed"`), token
 * attributes are recorded unless absent or `"false"`, free-text values are
 * normalized and clipped, and numeric values are recorded only when finite.
 */
function getState(element: Element): SemanticNodeState {
  const read = (attributeName: string) => attr(element, attributeName);
  const isPresent = (attributeName: string): boolean => read(attributeName) !== null;
  const isTrue = (attributeName: string): boolean => read(attributeName) === "true";
  const asBoolean = (raw: string | null): boolean | undefined => {
    if (raw === "true") return true;
    if (raw === "false") return false;
    return undefined;
  };
  const asTriState = (raw: string | null): boolean | "mixed" | undefined => (raw === "mixed" ? "mixed" : asBoolean(raw));
  const asToken = (raw: string | null): string | true | undefined => {
    if (!raw || raw === "false") return undefined;
    return raw === "true" ? true : raw;
  };

  const state: SemanticNodeState = {};

  if (isPresent("disabled") || isTrue("aria-disabled")) state.disabled = true;

  const busy = asBoolean(read("aria-busy"));
  if (busy !== undefined) state.busy = busy;

  const multiselectable = asBoolean(read("aria-multiselectable"));
  if (multiselectable !== undefined) state.multiselectable = multiselectable;

  const sort = read("aria-sort");
  if (sort) state.sort = normalizeText(sort, 40);

  const grabbed = asBoolean(read("aria-grabbed"));
  if (grabbed !== undefined) state.grabbed = grabbed;

  const dropEffect = read("aria-dropeffect");
  if (dropEffect) state.dropEffect = normalizeText(dropEffect, 80);

  if (isPresent("required") || isTrue("aria-required")) state.required = true;
  if (isPresent("readonly") || isTrue("aria-readonly")) state.readonly = true;

  // `aria-checked` wins over the native `checked` attribute when both exist.
  const checked = asTriState(read("aria-checked") ?? (isPresent("checked") ? "true" : null));
  if (checked !== undefined) state.checked = checked;

  if (isPresent("selected") || isTrue("aria-selected")) state.selected = true;

  const expanded = asBoolean(read("aria-expanded"));
  if (expanded !== undefined) state.expanded = expanded;

  const pressed = asTriState(read("aria-pressed"));
  if (pressed !== undefined) state.pressed = pressed;

  const invalid = asToken(read("aria-invalid"));
  if (invalid !== undefined) state.invalid = invalid;

  const current = asToken(read("aria-current"));
  if (current !== undefined) state.current = current;

  const haspopup = asToken(read("aria-haspopup"));
  if (haspopup !== undefined) state.haspopup = haspopup;

  const controls = read("aria-controls");
  if (controls) state.controls = normalizeText(controls, 120);

  const live = read("aria-live");
  if (live) state.live = normalizeText(live, 120);

  if (isTrue("aria-modal")) state.modal = true;

  const orientation = read("aria-orientation");
  if (orientation) state.orientation = normalizeText(orientation, 40);

  const valueMin = ariaNumber(read("aria-valuemin"));
  if (typeof valueMin === "number") state.valueMin = valueMin;

  const valueMax = ariaNumber(read("aria-valuemax"));
  if (typeof valueMax === "number") state.valueMax = valueMax;

  const valueNow = ariaNumber(read("aria-valuenow"));
  if (typeof valueNow === "number") state.valueNow = valueNow;

  const valueText = read("aria-valuetext");
  if (valueText) state.valueText = normalizeText(valueText, 120);

  return state;
}
/**
 * Parses a numeric ARIA attribute value.
 *
 * @returns The finite number, or `undefined` for `null`, blank or non-finite input.
 */
function ariaNumber(value: string | null): number | undefined {
  if (value === null) return undefined;
  if (value.trim().length === 0) return undefined;
  const parsed = Number(value);
  if (!Number.isFinite(parsed)) return undefined;
  return parsed;
}
/**
 * True when the element has an interactive role, is focusable, or is a native
 * form control tag (`button`, `input`, `select`, `textarea`).
 */
function isInteractive(element: Element, role: string | null, focusable: boolean): boolean {
  const hasInteractiveRole = Boolean(role) && interactiveRoles.has(role as string);
  if (hasInteractiveRole || focusable) return true;
  switch (element.name) {
    case "button":
    case "input":
    case "select":
    case "textarea":
      return true;
    default:
      return false;
  }
}
/**
 * Estimates whether the element can receive focus.
 *
 * Disabled elements never can. Otherwise the element is focusable when its
 * `tabindex` parses to a non-negative integer, when it has an interactive
 * role, or when it is an `<a>` with an `href` attribute.
 *
 * `tabindex` is parsed as an integer (leading whitespace and sign allowed,
 * trailing characters ignored) rather than with `Number()`, so an empty or
 * non-numeric value no longer counts as `0`.
 */
function isFocusable(element: Element, role: string | null): boolean {
  if (attr(element, "disabled") !== null) return false;
  const tabindex = attr(element, "tabindex");
  // parseInt yields NaN for "" or garbage, and NaN >= 0 is false.
  if (tabindex !== null && Number.parseInt(tabindex, 10) >= 0) return true;
  if (role && interactiveRoles.has(role)) return true;
  return element.name === "a" && attr(element, "href") !== null;
}
/**
 * True when the element is hidden via the `hidden` attribute,
 * `aria-hidden="true"`, or an inline style matched by the hidden-style pattern.
 */
function isHidden(element: Element): boolean {
  const flaggedHidden = attr(element, "hidden") !== null || attr(element, "aria-hidden") === "true";
  if (flaggedHidden) return true;
  const inlineStyle = attr(element, "style");
  if (!inlineStyle) return false;
  return hiddenStylePattern.test(inlineStyle);
}
/**
 * Heuristic: does the element's id, class, aria-label, title or data-testid
 * contain an advertising keyword (English whole words or Korean terms)?
 */
function isLikelyAd(element: Element): boolean {
  const hints: string[] = [];
  for (const attributeName of ["id", "class", "aria-label", "title", "data-testid"]) {
    const raw = attr(element, attributeName);
    if (raw) hints.push(raw);
  }
  const value = hints.join(" ").toLowerCase();
  if (/\b(ad|ads|advert|advertisement|banner|sponsor|sponsored|promotion|promoted|powerlink)\b/.test(value)) return true;
  return /파워링크|광고|직접홍보|홍보/.test(value);
}
/**
 * Heuristically flags page boilerplate such as footers and sidebars.
 *
 * - `<footer>` is always boilerplate.
 * - `<main>` / `<article>` tags, and elements whose first `role` token is
 *   `main` or `article`, are never boilerplate.
 * - Otherwise the lower-cased `id`, `class`, `aria-label` and `title` text is
 *   inspected: content-like keywords veto the match, then footer/sidebar
 *   keywords (English whole words or the Korean "푸터") confirm it.
 *
 * @param element - Element under inspection.
 * @returns `true` when the element looks like boilerplate.
 */
function isLikelyBoilerplate(element: Element): boolean {
  switch (element.name) {
    case "footer":
      return true;
    case "main":
    case "article":
      return false;
    default:
      break;
  }
  const role = firstToken(attr(element, "role"));
  if (role === "main" || role === "article") return false;
  const hints = ["id", "class", "aria-label", "title"]
    .map((attributeName) => attr(element, attributeName))
    .filter(Boolean)
    .join(" ")
    .toLowerCase();
  if (!hints) return false;
  const looksLikeContent = /\b(content|contents|entry|post-body|article-body|story-body|view-content)\b/.test(hints);
  if (looksLikeContent) return false;
  if (/\b(footer|sidebar)\b/.test(hints)) return true;
  return /푸터/.test(hints);
}
/**
 * Detects `<table>` elements whose `id`, `class`, `aria-label` or `title`
 * contain a `gall_list` / `gall-list` / `galllist` token or a token starting
 * with `bottom_list` / `bottom-list` / `bottomlist`.
 *
 * @param element - Element under inspection.
 * @returns `true` only for tables matching one of those naming patterns.
 */
function isLikelyBoilerplateTable(element: Element): boolean {
  if (element.name !== "table") return false;
  const hints = ["id", "class", "aria-label", "title"]
    .map((attributeName) => attr(element, attributeName))
    .filter(Boolean)
    .join(" ")
    .toLowerCase();
  if (/\bgall[_-]?list\b/.test(hints)) return true;
  return /\bbottom[_-]?list\w*\b/.test(hints);
}
/**
 * Replaces a boilerplate table with a flat container holding only the items
 * gathered by `collectFlattenedBoilerplateItems`.
 *
 * @param element - Table element to flatten.
 * @param context - Shared traversal context (supplies options and node ids).
 * @returns A container node tagged with the element's tag name, or `null`
 * when no items were collected.
 */
function flattenBoilerplateTable(element: Element, context: StaticContext): SemanticNode | null {
  const items = collectFlattenedBoilerplateItems(element, context);
  return items.length > 0 ? containerNode(context, element.name, items) : null;
}
/**
 * Walks the element children of `element` depth-first and returns a flat list
 * of the named nodes worth keeping, discarding the intermediate structure.
 *
 * A child is emitted as a leaf node (with no children of its own) when it has
 * a role and a non-empty computed name and is either interactive, a `heading`,
 * or an `img`. Any other child is descended into and its own qualifying
 * descendants are appended in document order.
 *
 * Children are skipped entirely when `shouldSkipElement` rejects them, when
 * they are hidden (unless `includeHidden` is set), or when they look like ads
 * (only if `excludeLikelyAds` is set).
 *
 * @param element - Parent whose subtree is flattened.
 * @param context - Shared traversal context; node ids are drawn from it.
 * @returns The collected leaf nodes in document order.
 */
function collectFlattenedBoilerplateItems(element: Element, context: StaticContext): SemanticNode[] {
  const collected: SemanticNode[] = [];
  for (const candidate of element.children) {
    const rejected =
      !isElement(candidate)
      || shouldSkipElement(candidate, context)
      || (!context.options.includeHidden && isHidden(candidate))
      || (context.options.excludeLikelyAds && isLikelyAd(candidate));
    if (rejected) continue;

    const role = getRole(candidate);
    const focusable = isFocusable(candidate, role);
    const interactive = isInteractive(candidate, role, focusable);
    const name = role ? computeName(candidate, role, context) : "";

    // Not a keepable leaf: look for keepable items deeper in this subtree.
    if (!role || !name || !(interactive || role === "heading" || role === "img")) {
      collected.push(...collectFlattenedBoilerplateItems(candidate, context));
      continue;
    }

    // The id is allocated first so the numbering sequence stays stable.
    const id = nextId(context);
    const selector = getSelector(candidate);
    const xpath = getXPath(candidate);
    const description = computeDescription(candidate, context);
    const value = getValue(candidate);
    const state = getState(candidate);
    const hasState = Object.keys(state).length > 0;

    // Optional fields are only present when they carry information.
    collected.push({
      id,
      tag: candidate.name,
      role,
      name,
      interactive,
      focusable,
      selector,
      xpath,
      children: [],
      ...(description ? { description } : {}),
      ...(value ? { value } : {}),
      ...(hasState ? { state } : {}),
      ...(context.options.includeAttributes ? { attributes: { ...candidate.attribs } } : {}),
    });
  }
  return collected;
}
/**
 * Reports whether the element carries an attribute that names it explicitly.
 *
 * @param element - Element under inspection.
 * @returns `true` when `aria-label`, `aria-labelledby` or `title` is present,
 * even if its value is empty.
 */
function hasExplicitNameSource(element: Element): boolean {
  return ["aria-label", "aria-labelledby", "title"].some((attributeName) => attr(element, attributeName) !== null);
}
/**
 * Returns the text held directly by an element, ignoring nested elements.
 *
 * @param element - Element whose immediate text-node children are read.
 * @param maxLength - Length budget forwarded to `normalizeText`.
 * @returns The text nodes' data joined with single spaces and normalized.
 */
function directText(element: Element, maxLength: number): string {
  const pieces: string[] = [];
  for (const child of element.children) {
    if (isText(child)) pieces.push(child.data);
  }
  return normalizeText(pieces.join(" "), maxLength);
}
/**
 * Gathers the text of every descendant of `element`, joined with spaces.
 *
 * When the element has a declarative shadow template child and a `context` is
 * supplied, the text is read from the template's content instead, with
 * `context.slotAssignments` temporarily set to the element's own slot
 * assignments so `<slot>` elements resolve to their assigned nodes.
 *
 * @param element - Root whose descendant text is collected.
 * @param context - Optional traversal context; required for shadow-template handling.
 * @param options - Optional filters, e.g. ids whose subtrees are excluded.
 * @returns The raw (un-normalized) text fragments joined with single spaces.
 */
function descendantText(element: Element, context?: StaticContext, options: DescendantTextOptions = {}): string {
  const parts: string[] = [];
  const shadowTemplate = element.children.find((child): child is Element => isElement(child) && isDeclarativeShadowTemplate(child));
  if (shadowTemplate && context) {
    const previousAssignments = context.slotAssignments;
    context.slotAssignments = collectSlotAssignments(element);
    try {
      collectDescendantText(shadowTemplate.children, parts, context, options);
    } finally {
      // Always restore the caller's assignments, even if collection throws
      // (for example a stack overflow on a very deep tree); otherwise the
      // shared context would stay pointed at this element's slots.
      context.slotAssignments = previousAssignments;
    }
    return parts.join(" ");
  }
  collectDescendantText(element.children, parts, context, options);
  return parts.join(" ");
}
/**
 * Appends the text of `nodes` and their descendants to `parts`, in document order.
 *
 * - Text nodes contribute their data verbatim.
 * - Elements whose `id` is listed in `options.excludeIds` are skipped with their subtree.
 * - `<slot>` elements, when `context.slotAssignments` is set, are replaced by the
 *   nodes assigned to that slot name (unnamed slots use `""`), falling back to
 *   the slot's own children when nothing is assigned.
 * - Tags in `nonSemanticTags` and `<noscript>` are skipped with their subtree.
 *
 * @param nodes - Nodes to scan.
 * @param parts - Output array that receives the text fragments.
 * @param context - Optional traversal context supplying slot assignments.
 * @param options - Optional filters.
 */
function collectDescendantText(nodes: AnyNode[], parts: string[], context?: StaticContext, options: DescendantTextOptions = {}): void {
  nodes.forEach((node) => {
    if (isText(node)) {
      parts.push(node.data);
      return;
    }
    if (!isElement(node)) return;

    const id = attr(node, "id");
    if (id && options.excludeIds?.has(id)) return;

    const assignments = node.name === "slot" ? context?.slotAssignments : undefined;
    if (assignments) {
      const assigned = assignments.get(attr(node, "name") ?? "") ?? [];
      const source = assigned.length > 0 ? assigned : node.children;
      collectDescendantText(source, parts, context, options);
      return;
    }

    if (node.name === "noscript" || nonSemanticTags.has(node.name)) return;
    collectDescendantText(node.children, parts, context, options);
  });
}
/**
 * Collapses whitespace and truncates text to a maximum length.
 *
 * Every run of whitespace becomes a single space and the ends are trimmed.
 * When the result is longer than `maxLength` it is cut and suffixed with
 * `"..."` so that the returned string, ellipsis included, never exceeds
 * `maxLength` characters. (Previously the truncated form was
 * `maxLength + 2` characters long.)
 *
 * @param value - Raw text.
 * @param maxLength - Maximum length of the returned string, in UTF-16 code units.
 * @returns The normalized, possibly truncated text.
 */
function normalizeText(value: string, maxLength: number): string {
  const ellipsis = "...";
  const normalized = value.replace(/\s+/g, " ").trim();
  // Written as a negated `>` so a NaN/Infinity budget still means "no limit".
  if (!(normalized.length > maxLength)) return normalized;
  // Not enough room for any text: return as much of the ellipsis as fits.
  if (maxLength <= ellipsis.length) return ellipsis.slice(0, Math.max(0, maxLength));
  let end = maxLength - ellipsis.length;
  // Do not cut between the two halves of a surrogate pair.
  const lastKept = normalized.charCodeAt(end - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  return `${normalized.slice(0, end)}${ellipsis}`;
}
/**
 * Computes an element's accessible description.
 *
 * The ids listed in `aria-describedby` are looked up in `context.ids`; the
 * descendant text of every resolved element is joined with spaces. When that
 * yields no text (or the attribute is missing), the `title` attribute is used.
 *
 * @param element - Element being described.
 * @param context - Traversal context providing the id map and `maxTextLength`.
 * @returns The normalized description, or `""` when none is available.
 */
function computeDescription(element: Element, context: StaticContext): string {
  const limit = context.options.maxTextLength;
  const describedBy = attr(element, "aria-describedby");
  if (describedBy) {
    const fragments: string[] = [];
    for (const id of describedBy.split(/\s+/)) {
      const target = context.ids.get(id);
      if (!target) continue;
      const text = descendantText(target, context);
      if (text) fragments.push(text);
    }
    const combined = fragments.join(" ");
    if (combined) return normalizeText(combined, limit);
  }
  return normalizeText(attr(element, "title") ?? "", limit);
}
/**
 * Reads an element's current value from its attributes.
 *
 * The first attribute that is present wins, in this order: `value`,
 * `aria-valuetext`, `aria-valuenow`. A present-but-empty attribute still wins.
 *
 * @param element - Element under inspection.
 * @returns The chosen value normalized to at most 240 characters of budget,
 * or `""` when none of the attributes exist.
 */
function getValue(element: Element): string {
  for (const source of ["value", "aria-valuetext", "aria-valuenow"]) {
    const raw = attr(element, source);
    if (raw !== null) return normalizeText(raw, 240);
  }
  return normalizeText("", 240);
}
/**
 * Builds a short CSS selector for an element.
 *
 * - With a non-empty `id`: `#<escaped id>`.
 * - Otherwise the tag name, suffixed with `:nth-of-type(n)` when the element
 *   is not the first same-tag element among its parent's children.
 *
 * The selector is relative to the element's position among its siblings only;
 * no ancestor path is included.
 *
 * @param element - Element to describe.
 * @returns The selector string.
 */
function getSelector(element: Element): string {
  const tag = element.name;
  const id = attr(element, "id");
  if (id) return `#${cssEscape(id)}`;
  const container = element.parent;
  if (!container || !("children" in container)) return tag;
  const sameTag = container.children.filter((node): node is Element => isElement(node) && node.name === tag);
  const position = sameTag.indexOf(element);
  if (position <= 0) return tag;
  return `${tag}:nth-of-type(${position + 1})`;
}
/**
 * Builds an absolute, index-based XPath for an element.
 *
 * Each step has the form `tag[n]`, where `n` is the 1-based position of the
 * element among same-tag element siblings. The walk goes upwards through
 * element parents and stops once the parent is not an element.
 *
 * @param element - Element to locate.
 * @returns A path such as `/html[1]/body[1]/div[2]`.
 */
function getXPath(element: Element): string {
  // Steps are gathered leaf-first and reversed once at the end.
  const segments: string[] = [];
  let cursor: Element | null = element;
  while (cursor !== null) {
    const self: Element = cursor;
    const container: AnyNode | null = self.parent;
    /* v8 ignore next 4 -- htmlparser2 assigns parents for nodes returned by the public extract API. */
    if (!container || !("children" in container)) {
      segments.push(self.name);
      break;
    }
    const sameTag = container.children.filter((node): node is Element => isElement(node) && node.name === self.name);
    segments.push(`${self.name}[${sameTag.indexOf(self) + 1}]`);
    cursor = isElement(container) ? container : null;
  }
  return `/${segments.reverse().join("/")}`;
}
/**
 * Creates a nameless, role-less, non-interactive grouping node.
 *
 * @param context - Traversal context that supplies the next node id.
 * @param tag - Tag name recorded on the node.
 * @param children - Child nodes placed under the container.
 * @returns The new container node.
 */
function containerNode(context: StaticContext, tag: string, children: SemanticNode[]): SemanticNode {
  const node: SemanticNode = {
    id: nextId(context),
    tag,
    role: null,
    name: "",
    interactive: false,
    focusable: false,
    children,
  };
  return node;
}
/* v8 ignore start -- defensive fallback for impossible parser roots. */
/**
 * Creates a childless placeholder node that records why content is unavailable.
 *
 * @param context - Traversal context that supplies the next node id.
 * @param tag - Tag name recorded on the node.
 * @param unavailableReason - Explanation stored on the node.
 * @returns The new placeholder node.
 */
function unavailableNode(context: StaticContext, tag: string, unavailableReason: string): SemanticNode {
  const node: SemanticNode = {
    id: nextId(context),
    tag,
    role: null,
    name: "",
    interactive: false,
    focusable: false,
    children: [],
    unavailableReason,
  };
  return node;
}
/* v8 ignore stop */
/**
 * Creates a `note` node that stands in for nodes left out of the tree.
 *
 * @param context - Traversal context that supplies the next node id.
 * @param omitted - Number of nodes that were dropped; embedded in the node name.
 * @returns A childless node tagged `omitted`.
 */
function omittedNode(context: StaticContext, omitted: number): SemanticNode {
  const node: SemanticNode = {
    id: nextId(context),
    tag: "omitted",
    role: "note",
    name: `${omitted} static nodes omitted`,
    interactive: false,
    focusable: false,
    children: [],
  };
  return node;
}
/**
 * Allocates the next node id and advances the context's counter.
 *
 * @param context - Traversal context; its `nextId` counter is incremented.
 * @returns An id of the form `static-<n>`, where `n` is the counter value
 * before the increment.
 */
function nextId(context: StaticContext): string {
  const sequence = context.nextId;
  context.nextId = sequence + 1;
  return `static-${sequence}`;
}
/**
 * Reads an attribute, distinguishing "absent" from "present but empty".
 *
 * Only own properties of `element.attribs` count, so names inherited from
 * `Object.prototype` (such as `toString`) are not mistaken for attributes.
 *
 * @param element - Element whose attributes are read.
 * @param name - Attribute name.
 * @returns The attribute value (`""` for a valueless attribute), or `null`
 * when the attribute does not exist.
 */
function attr(element: Element, name: string): string | null {
  const attributes = element.attribs;
  if (!Object.prototype.hasOwnProperty.call(attributes, name)) return null;
  return attributes[name] ?? "";
}
/**
 * Returns the first whitespace-separated token of a string.
 *
 * @param value - Text to split, or `null`.
 * @returns The first token, or `null` when the input is `null`, empty, or
 * contains only whitespace.
 */
function firstToken(value: string | null): string | null {
  if (value == null) return null;
  const [head] = value.trim().split(/\s+/);
  return head ? head : null;
}
/**
 * Escapes a string so it can be used as a CSS identifier, e.g. after `#`.
 *
 * Follows the CSSOM "serialize an identifier" rules (the algorithm behind
 * `CSS.escape`). The previous single-regex escape produced invalid selectors
 * for ids that start with a digit (`#1abc`) or contain control characters or
 * newlines, and did not handle a lone `-`.
 *
 * - U+0000 becomes U+FFFD.
 * - Control characters (U+0001–U+001F, U+007F) are hex-escaped.
 * - A leading digit, or a digit directly after a leading `-`, is hex-escaped.
 * - A string consisting of only `-` is escaped as `\-`.
 * - ASCII letters, digits, `-`, `_` and all code units >= U+0080 pass through
 *   unchanged, so surrogate pairs are never split.
 * - Every other character is prefixed with a backslash.
 *
 * @param value - Raw identifier text.
 * @returns The escaped identifier.
 */
function cssEscape(value: string): string {
  const length = value.length;
  const firstUnit = length > 0 ? value.charCodeAt(0) : -1;
  let escaped = "";
  for (let index = 0; index < length; index++) {
    const unit = value.charCodeAt(index);
    const char = value.charAt(index);
    if (unit === 0x0000) {
      escaped += "\uFFFD";
      continue;
    }
    const isControl = (unit >= 0x0001 && unit <= 0x001f) || unit === 0x007f;
    const isDigit = unit >= 0x0030 && unit <= 0x0039;
    const isLeadingDigit = isDigit && (index === 0 || (index === 1 && firstUnit === 0x002d));
    if (isControl || isLeadingDigit) {
      // Hex escapes end with a space so the following character is not absorbed.
      escaped += `\\${unit.toString(16)} `;
      continue;
    }
    if (index === 0 && length === 1 && unit === 0x002d) {
      escaped += `\\${char}`;
      continue;
    }
    const isIdentifierChar =
      unit >= 0x0080
      || unit === 0x002d
      || unit === 0x005f
      || isDigit
      || (unit >= 0x0041 && unit <= 0x005a)
      || (unit >= 0x0061 && unit <= 0x007a);
    escaped += isIdentifierChar ? char : `\\${char}`;
  }
  return escaped;
}
/**
 * Type guard for element nodes.
 *
 * @param node - Any parsed node.
 * @returns `true` when the node's type is `tag`, `script` or `style`.
 */
function isElement(node: AnyNode): node is Element {
  switch (node.type) {
    case "tag":
    case "script":
    case "style":
      return true;
    default:
      return false;
  }
}
/**
 * Type guard for text nodes.
 *
 * @param node - Any parsed node.
 * @returns `true` when the node's type is `text`.
 */
function isText(node: AnyNode): node is Text {
  const { type } = node;
  return type === "text";
}
/**
 * Finds the first element with the given tag name, in document order.
 *
 * The search is depth-first and pre-order: a node is checked before its
 * children, and earlier siblings before later ones. Only element nodes are
 * inspected or descended into.
 *
 * @param nodes - Root nodes to search.
 * @param name - Tag name to match exactly.
 * @returns The first matching element, or `undefined` when there is none.
 */
function findElement(nodes: AnyNode[], name: string): Element | undefined {
  // Explicit stack; entries are pushed in reverse so they pop in document order.
  const pending: AnyNode[] = [...nodes].reverse();
  for (;;) {
    const node = pending.pop();
    if (node === undefined) return undefined;
    if (!isElement(node)) continue;
    if (node.name === name) return node;
    for (const child of [...node.children].reverse()) pending.push(child);
  }
}
/**
 * Wraps a list of nodes in a synthetic `fragment` element with no attributes.
 *
 * @param children - Nodes passed to the new element as its children.
 * @returns The synthetic root element.
 */
function fragmentRoot(children: AnyNode[]): Element {
  const root = new DomElement("fragment", {}, children);
  return root;
}