From 56c453a982693c64bb519240af7a6dc62aba8c62 Mon Sep 17 00:00:00 2001 From: Remy Wang Date: Thu, 20 Nov 2025 12:39:37 -0800 Subject: [PATCH 1/4] Add text nodes. --- README.md | 3 +- src/htmlparser.lua | 24 ++++++++++ src/htmlparser/ElementNode.lua | 22 +++++---- tst/init.lua | 84 ++++++++++++++++++++++++++-------- 4 files changed, 102 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 37149aa..aa0d5f7 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Supported selectors are a subset of [jQuery's selectors][1]: Selectors can be combined; e.g. `".class:not([attribute]) element.class"` ## Element type -All tree elements provide, apart from `:select` and `()`, the following accessors: +All tree elements provide, apart from `:select` and `()`, the following accessors (a text node have `nil` as its tagname): ### Basic - `.name` the element's tagname @@ -96,7 +96,6 @@ All tree elements provide, apart from `:select` and `()`, the following accessor - Attribute values in selector strings cannot contain any spaces - The spaces before and after the `>` in a `parent > child` relation are mandatory - `line1
line2

")`, `root.nodes[1]:getcontent()` is `"line1
line2"`, while `root.nodes[1].nodes[1].name` is `"br"` - No start or end tags are implied when [omitted](http://www.w3.org/TR/html5/syntax.html#optional-tags). Only the [void elements](http://www.w3.org/TR/html5/syntax.html#void-elements) should not have an end tag - No validation is done for tag or attribute names or nesting of element types. The list of void elements is in fact the only part specific to HTML diff --git a/src/htmlparser.lua b/src/htmlparser.lua index c6226be..ce9ac8d 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -144,6 +144,7 @@ local function parse(text,limit) -- {{{ local index = 0 local root = ElementNode:new(index, str(text)) local node, descend, tpos, opentags = root, true, 1, {} + local lasttagend = nil -- position after last tag ended (nil = start of document) while true do -- MainLoop {{{ if index == limit then -- {{{ @@ -160,6 +161,15 @@ local function parse(text,limit) -- {{{ dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name)) -- }}} if not name then break end + -- Create text node for any text before this element {{{ + if lasttagend and openstart and lasttagend < openstart then + index = index + 1 + local textnode = ElementNode:new(index, nil, node, descend, lasttagend, openstart - 1) + textnode:close() + dbg("[MainLoop]:#LINE# Created text node from %d to %d", lasttagend, openstart - 1) + end + lasttagend = tpos + 1 + -- }}} -- Some more vars {{{ index = index + 1 local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos) @@ -236,6 +246,20 @@ local function parse(text,limit) -- {{{ if not closing or closing == "" then break end tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags + + -- Create text node for any text before this closing tag {{{ + if closing == "/" then + local tagstart = root._text:find("<", closestart) + if lasttagend and tagstart and lasttagend < tagstart then + index = index + 1 + -- Text before closing tag should be a child of the tag being closed + local textnode = ElementNode:new(index, nil, tag, true, lasttagend, tagstart - 1) + textnode:close() + dbg("[TagCloseLoop]:#LINE# Created text node from %d to %d", lasttagend, tagstart - 1) + end + lasttagend = closeend + 1 + end + -- }}} closestart = root._text:find("<", closestart) dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart)) tag:close(closestart, closeend + 1) diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua index 329043d..41f1011 100644 --- a/src/htmlparser/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -167,15 +167,19 @@ function ElementNode:close(closestart, closeend) node = node.parent if not node then break end node.deepernodes:add(self) - insert(node.deeperelements, self.name, self) - for k in pairs(self.attributes) do - insert(node.deeperattributes, k, self) - end - if self.id then - insert(node.deeperids, self.id, self) - end - for _,v in ipairs(self.classes) do - insert(node.deeperclasses, v, self) + -- text nodes (name == nil) are added to deepernodes for * selector, + -- but not to other indexes since they can't be selected by element name, etc. + if self.name ~= nil then + insert(node.deeperelements, self.name, self) + for k in pairs(self.attributes) do + insert(node.deeperattributes, k, self) + end + if self.id then + insert(node.deeperids, self.id, self) + end + for _,v in ipairs(self.classes) do + insert(node.deeperclasses, v, self) + end end end end diff --git a/tst/init.lua b/tst/init.lua index eb1bbf6..81ac87e 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -38,11 +38,11 @@ function test_void()

]]) - assert_equal(5, #tree.nodes, "top level") + assert_equal(9, #tree.nodes, "top level") for _,n in ipairs(tree.nodes) do if n.name == "p" then - assert_equal(4, #n.nodes, "deeper level") - else + assert_equal(9, #n.nodes, "deeper level") + elseif n.name then assert_equal("br", n.name, "name") assert_equal("", n:getcontent(), "content") end @@ -70,7 +70,7 @@ function test_class() ]]) - assert_equal(3, #tree.nodes, "top level") + assert_equal(5, #tree.nodes, "top level") assert_equal(1, #tree(".one"), ".one") assert_equal(2, #tree(".two"), ".two") assert_equal(2, #tree(".three"), ".three") @@ -126,10 +126,10 @@ function test_attr_notequal() ]]) - assert_equal(4, #tree.nodes, "top level") - assert_equal(3, #tree("[a1!='a1']"), "a1!='a1'") - assert_equal(4, #tree("[a1!='b1']"), "a1!='b1'") - assert_equal(3, #tree("[a1!='']"), "a1!=''") + assert_equal(7, #tree.nodes, "top level") + assert_equal(6, #tree("[a1!='a1']"), "a1!='a1'") + assert_equal(7, #tree("[a1!='b1']"), "a1!='b1'") + assert_equal(6, #tree("[a1!='']"), "a1!=''") assert_equal(3, #tree("[a1!=]"), "a1!=") end @@ -141,7 +141,7 @@ function test_attr_prefix_start_end() ]]) - assert_equal(5, #tree.nodes, "top level") + assert_equal(9, #tree.nodes, "top level") assert_equal(3, #tree("[a1|='en']"), "a1|='en'") assert_equal(4, #tree("[a1^='en']"), "a1^='en'") assert_equal(2, #tree("[a1$='en']"), "a1$='en'") @@ -154,7 +154,7 @@ function test_attr_word() ]]) - assert_equal(4, #tree.nodes, "top level") + assert_equal(7, #tree.nodes, "top level") assert_equal(1, #tree("[a1~='two']"), "a1~='two'") assert_equal(2, #tree("[a1~='three']"), "a1~='three'") assert_equal(1, #tree("[a1~='four']"), "a1~='four'") @@ -169,7 +169,7 @@ function test_attr_contains() ]]) - assert_equal(6, #tree.nodes, "top level") + assert_equal(11, #tree.nodes, "top level") assert_equal(2, #tree("[a1*='one']"), "a1*='one'") assert_equal(2, #tree("[a1*='t']"), "a1*='t'") assert_equal(1, #tree("[a1*='f']"), "a1*='f'") @@ -238,11 +238,11 @@ function test_not() ]]) - assert_equal(2, #tree.nodes, "top level") - assert_equal(1, #tree(":not([a1=1])"), ":not([a1=1])") - assert_equal(1, #tree(":not([a2])"), ":not([a2])") - assert_equal(1, #tree(":not(n)"), ":not(n)") - assert_equal(2, #tree(":not(m)"), ":not(m)") + assert_equal(3, #tree.nodes, "top level") + assert_equal(4, #tree(":not([a1=1])"), ":not([a1=1])") + assert_equal(4, #tree(":not([a2])"), ":not([a2])") + assert_equal(4, #tree(":not(n)"), ":not(n)") + assert_equal(5, #tree(":not(m)"), ":not(m)") end function test_combine() @@ -256,7 +256,7 @@ function test_combine() ]]) - assert_equal(2, #tree.nodes, "top level") + assert_equal(3, #tree.nodes, "top level") assert_equal(2, #tree("e.c:not([a|='1']) > n[b*='2']"), "e.c:not([a|='1']) > n[b*='2']") assert_equal(3, #tree("e.c:not([a|='1']) n[b*='2']"), "e.c:not([a|='1']) n[b*='2']") assert_equal(1, #tree("#123 .c[b]"), "#123 .c[b]") @@ -290,9 +290,14 @@ function test_order() assert_equal(i, tonumber(v:getcontent()), "n order") end local notn = tree(":not(n)") - assert_equal(4, #notn, "notn") + assert_equal(31, #notn, "notn") + local blanks = 0 for i,v in pairs(notn) do - assert_equal(i, tonumber(v.name), "notn order") + if v.name then + assert_equal(i, blanks+tonumber(v.name), "notn order") + else + blanks = blanks + 1 + end end end @@ -327,5 +332,44 @@ function test_loop_limit() with unclosed attribute
]]) -- issue#42 - assert(#tree.nodes==17) + assert(#tree.nodes==33) +end + +function test_text_nodes() + local tree = htmlparser.parse("

line1
line2

") + assert_equal(1, #tree.nodes, "top level") + local p = tree.nodes[1] + assert_equal("p", p.name, "p element") + assert_equal(3, #p.nodes, "p should have 3 children") + + assert_equal(nil, p.nodes[1].name, "first child should be text node") + assert_equal("line1", p.nodes[1]:gettext(), "first text content") + + assert_equal("br", p.nodes[2].name, "second child should be br") + + assert_equal(nil, p.nodes[3].name, "third child should be text node") + assert_equal("line2", p.nodes[3]:gettext(), "third text content") + + assert_equal("line1
line2", p:getcontent(), "getcontent backward compatibility") +end + +function test_text_nodes_whitespace() + local tree = htmlparser.parse("


") + assert_equal(1, #tree.nodes, "top level") + local p = tree.nodes[1] + assert_equal(3, #p.nodes, "p should have 3 children including whitespace") + assert_equal(nil, p.nodes[1].name, "first should be whitespace text node") + assert_equal(nil, p.nodes[3].name, "third should be whitespace text node") +end + +function test_text_nodes_selectors() + local tree = htmlparser.parse("
textinnermore
") + local div = tree.nodes[1] + + local all = div:select("*") + assert_equal(4, #all, "* selector should return 4 nodes (3 text + 1 span)") + + local spans = div:select("span") + assert_equal(1, #spans, "span selector should return 1 node") + assert_equal("span", spans[1].name, "should be span element") end From 7682c5e2e5cb0db06011f31530d01aa9bb047bd6 Mon Sep 17 00:00:00 2001 From: Remy Wang Date: Thu, 20 Nov 2025 13:05:21 -0800 Subject: [PATCH 2/4] Fix typo. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aa0d5f7..f7ec795 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Supported selectors are a subset of [jQuery's selectors][1]: Selectors can be combined; e.g. `".class:not([attribute]) element.class"` ## Element type -All tree elements provide, apart from `:select` and `()`, the following accessors (a text node have `nil` as its tagname): +All tree elements provide, apart from `:select` and `()`, the following accessors (a text node has `nil` as its tagname): ### Basic - `.name` the element's tagname From 6e413618e47b9231c1f6f20ea0d6ce846022c461 Mon Sep 17 00:00:00 2001 From: Remy Wang Date: Thu, 20 Nov 2025 13:23:20 -0800 Subject: [PATCH 3/4] Use _text as tagname for text nodes --- README.md | 2 +- src/htmlparser.lua | 16 +++++++-------- src/htmlparser/ElementNode.lua | 36 +++++++++++++++------------------- tst/init.lua | 22 +++++++++++++-------- 4 files changed, 39 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index f7ec795..d3f8e26 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor ### Other - `.index` sequence number of elements in order of appearance; root index is `0` -- `:gettext()` the complete element text, starting with `""` or `""` +- `:getrawtext()` the complete element text, starting with `""` or `""` - `.level` how deep the element is in the tree; root level is `0` - `.root` the root element of the tree; `root.root` is `root` - `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none diff --git a/src/htmlparser.lua b/src/htmlparser.lua index ce9ac8d..b89479c 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -153,7 +153,7 @@ local function parse(text,limit) -- {{{ end -- }}} -- openstart/tpos Definitions {{{ local openstart, name - openstart, tpos, name = root._text:find( + openstart, tpos, name = root._rawtext:find( "<" .. -- an uncaptured starting "<" "([%w-]+)" .. -- name = the first word, directly following the "<" "[^>]*>", -- include, but not capture everything up to the next ">" @@ -164,7 +164,7 @@ local function parse(text,limit) -- {{{ -- Create text node for any text before this element {{{ if lasttagend and openstart and lasttagend < openstart then index = index + 1 - local textnode = ElementNode:new(index, nil, node, descend, lasttagend, openstart - 1) + local textnode = ElementNode:new(index, "_text", node, descend, lasttagend, openstart - 1) textnode:close() dbg("[MainLoop]:#LINE# Created text node from %d to %d", lasttagend, openstart - 1) end @@ -175,7 +175,7 @@ local function parse(text,limit) -- {{{ local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos) node = tag local tagloop - local tagst, apos = tag:gettext(), 1 + local tagst, apos = tag:getrawtext(), 1 -- }}} while true do -- TagLoop {{{ dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop)) @@ -240,7 +240,7 @@ local function parse(text,limit) -- {{{ end local closestart, closing, closename - closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend) + closestart, closeend, closing, closename = root._rawtext:find("[^<]*<(/?)([%w-]+)", closeend) dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename)) if not closing or closing == "" then break end @@ -249,18 +249,18 @@ local function parse(text,limit) -- {{{ -- Create text node for any text before this closing tag {{{ if closing == "/" then - local tagstart = root._text:find("<", closestart) + local tagstart = root._rawtext:find("<", closestart) if lasttagend and tagstart and lasttagend < tagstart then index = index + 1 -- Text before closing tag should be a child of the tag being closed - local textnode = ElementNode:new(index, nil, tag, true, lasttagend, tagstart - 1) + local textnode = ElementNode:new(index, "_text", tag, true, lasttagend, tagstart - 1) textnode:close() dbg("[TagCloseLoop]:#LINE# Created text node from %d to %d", lasttagend, tagstart - 1) end lasttagend = closeend + 1 end -- }}} - closestart = root._text:find("<", closestart) + closestart = root._rawtext:find("<", closestart) dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart)) tag:close(closestart, closeend + 1) node = tag.parent @@ -271,7 +271,7 @@ local function parse(text,limit) -- {{{ if tpl then -- {{{ dbg("tpl") for k,v in pairs(tpr) do - root._text = root._text:gsub(v,k) + root._rawtext = root._rawtext:gsub(v,k) end end -- }}} return root diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua index 41f1011..88716d4 100644 --- a/src/htmlparser/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -106,7 +106,7 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend) if not node then instance.name = "root" instance.root = instance - instance._text = nameortext + instance._rawtext = nameortext local length = string.len(nameortext) instance._openstart, instance._openend = 1, length instance._closestart, instance._closeend = 1, length @@ -124,20 +124,20 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend) return setmetatable(instance, ElementNode.mt) end -function ElementNode:gettext() - return string.sub(self.root._text, self._openstart, self._closeend) +function ElementNode:getrawtext() + return string.sub(self.root._rawtext, self._openstart, self._closeend) end -function ElementNode:settext(c) - self.root._text=c +function ElementNode:setrawtext(c) + self.root._rawtext=c end function ElementNode:textonly() - return (self:gettext():gsub("<[^>]*>","")) + return (self:getrawtext():gsub("<[^>]*>","")) end function ElementNode:getcontent() - return string.sub(self.root._text, self._openend + 1, self._closestart - 1) + return string.sub(self.root._rawtext, self._openend + 1, self._closestart - 1) end function ElementNode:addattribute(k, v) @@ -167,19 +167,15 @@ function ElementNode:close(closestart, closeend) node = node.parent if not node then break end node.deepernodes:add(self) - -- text nodes (name == nil) are added to deepernodes for * selector, - -- but not to other indexes since they can't be selected by element name, etc. - if self.name ~= nil then - insert(node.deeperelements, self.name, self) - for k in pairs(self.attributes) do - insert(node.deeperattributes, k, self) - end - if self.id then - insert(node.deeperids, self.id, self) - end - for _,v in ipairs(self.classes) do - insert(node.deeperclasses, v, self) - end + insert(node.deeperelements, self.name, self) + for k in pairs(self.attributes) do + insert(node.deeperattributes, k, self) + end + if self.id then + insert(node.deeperids, self.id, self) + end + for _,v in ipairs(self.classes) do + insert(node.deeperclasses, v, self) end end end diff --git a/tst/init.lua b/tst/init.lua index 81ac87e..b8ee7fd 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -42,7 +42,7 @@ function test_void() for _,n in ipairs(tree.nodes) do if n.name == "p" then assert_equal(9, #n.nodes, "deeper level") - elseif n.name then + elseif n.name ~= "_text" then assert_equal("br", n.name, "name") assert_equal("", n:getcontent(), "content") end @@ -293,7 +293,7 @@ function test_order() assert_equal(31, #notn, "notn") local blanks = 0 for i,v in pairs(notn) do - if v.name then + if v.name ~= "_text" then assert_equal(i, blanks+tonumber(v.name), "notn order") else blanks = blanks + 1 @@ -342,13 +342,13 @@ function test_text_nodes() assert_equal("p", p.name, "p element") assert_equal(3, #p.nodes, "p should have 3 children") - assert_equal(nil, p.nodes[1].name, "first child should be text node") - assert_equal("line1", p.nodes[1]:gettext(), "first text content") + assert_equal("_text", p.nodes[1].name, "first child should be text node") + assert_equal("line1", p.nodes[1]:getrawtext(), "first text content") assert_equal("br", p.nodes[2].name, "second child should be br") - assert_equal(nil, p.nodes[3].name, "third child should be text node") - assert_equal("line2", p.nodes[3]:gettext(), "third text content") + assert_equal("_text", p.nodes[3].name, "third child should be text node") + assert_equal("line2", p.nodes[3]:getrawtext(), "third text content") assert_equal("line1
line2", p:getcontent(), "getcontent backward compatibility") end @@ -358,8 +358,8 @@ function test_text_nodes_whitespace() assert_equal(1, #tree.nodes, "top level") local p = tree.nodes[1] assert_equal(3, #p.nodes, "p should have 3 children including whitespace") - assert_equal(nil, p.nodes[1].name, "first should be whitespace text node") - assert_equal(nil, p.nodes[3].name, "third should be whitespace text node") + assert_equal("_text", p.nodes[1].name, "first should be whitespace text node") + assert_equal("_text", p.nodes[3].name, "third should be whitespace text node") end function test_text_nodes_selectors() @@ -372,4 +372,10 @@ function test_text_nodes_selectors() local spans = div:select("span") assert_equal(1, #spans, "span selector should return 1 node") assert_equal("span", spans[1].name, "should be span element") + + local texts = div:select("_text") + assert_equal(3, #texts, "_text selector should return 3 text nodes") + for i, node in ipairs(texts) do + assert_equal("_text", node.name, "should be text node") + end end From f0dce27326c34e8123821210eb734861177efec3 Mon Sep 17 00:00:00 2001 From: Remy Wang Date: Thu, 20 Nov 2025 13:38:00 -0800 Subject: [PATCH 4/4] remove special case on text nodes from selectors doc --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d3f8e26..bb433c1 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Supported selectors are a subset of [jQuery's selectors][1]: Selectors can be combined; e.g. `".class:not([attribute]) element.class"` ## Element type -All tree elements provide, apart from `:select` and `()`, the following accessors (a text node has `nil` as its tagname): +All tree elements provide, apart from `:select` and `()`, the following accessors: ### Basic - `.name` the element's tagname