From 56c453a982693c64bb519240af7a6dc62aba8c62 Mon Sep 17 00:00:00 2001
From: Remy Wang
Date: Thu, 20 Nov 2025 12:39:37 -0800
Subject: [PATCH 1/4] Add text nodes.
---
README.md | 3 +-
src/htmlparser.lua | 24 ++++++++++
src/htmlparser/ElementNode.lua | 22 +++++----
tst/init.lua | 84 ++++++++++++++++++++++++++--------
4 files changed, 102 insertions(+), 31 deletions(-)
diff --git a/README.md b/README.md
index 37149aa..aa0d5f7 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ Supported selectors are a subset of [jQuery's selectors][1]:
Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
## Element type
-All tree elements provide, apart from `:select` and `()`, the following accessors:
+All tree elements provide, apart from `:select` and `()`, the following accessors (a text node have `nil` as its tagname):
### Basic
- `.name` the element's tagname
@@ -96,7 +96,6 @@ All tree elements provide, apart from `:select` and `()`, the following accessor
- Attribute values in selector strings cannot contain any spaces
- The spaces before and after the `>` in a `parent > child` relation are mandatory
- `line1
line2
")`, `root.nodes[1]:getcontent()` is `"line1
line2"`, while `root.nodes[1].nodes[1].name` is `"br"`
- No start or end tags are implied when [omitted](http://www.w3.org/TR/html5/syntax.html#optional-tags). Only the [void elements](http://www.w3.org/TR/html5/syntax.html#void-elements) should not have an end tag
- No validation is done for tag or attribute names or nesting of element types. The list of void elements is in fact the only part specific to HTML
diff --git a/src/htmlparser.lua b/src/htmlparser.lua
index c6226be..ce9ac8d 100644
--- a/src/htmlparser.lua
+++ b/src/htmlparser.lua
@@ -144,6 +144,7 @@ local function parse(text,limit) -- {{{
local index = 0
local root = ElementNode:new(index, str(text))
local node, descend, tpos, opentags = root, true, 1, {}
+ local lasttagend = nil -- position after last tag ended (nil = start of document)
while true do -- MainLoop {{{
if index == limit then -- {{{
@@ -160,6 +161,15 @@ local function parse(text,limit) -- {{{
dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name))
-- }}}
if not name then break end
+ -- Create text node for any text before this element {{{
+ if lasttagend and openstart and lasttagend < openstart then
+ index = index + 1
+ local textnode = ElementNode:new(index, nil, node, descend, lasttagend, openstart - 1)
+ textnode:close()
+ dbg("[MainLoop]:#LINE# Created text node from %d to %d", lasttagend, openstart - 1)
+ end
+ lasttagend = tpos + 1
+ -- }}}
-- Some more vars {{{
index = index + 1
local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos)
@@ -236,6 +246,20 @@ local function parse(text,limit) -- {{{
if not closing or closing == "" then break end
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
+
+ -- Create text node for any text before this closing tag {{{
+ if closing == "/" then
+ local tagstart = root._text:find("<", closestart)
+ if lasttagend and tagstart and lasttagend < tagstart then
+ index = index + 1
+ -- Text before closing tag should be a child of the tag being closed
+ local textnode = ElementNode:new(index, nil, tag, true, lasttagend, tagstart - 1)
+ textnode:close()
+ dbg("[TagCloseLoop]:#LINE# Created text node from %d to %d", lasttagend, tagstart - 1)
+ end
+ lasttagend = closeend + 1
+ end
+ -- }}}
closestart = root._text:find("<", closestart)
dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart))
tag:close(closestart, closeend + 1)
diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua
index 329043d..41f1011 100644
--- a/src/htmlparser/ElementNode.lua
+++ b/src/htmlparser/ElementNode.lua
@@ -167,15 +167,19 @@ function ElementNode:close(closestart, closeend)
node = node.parent
if not node then break end
node.deepernodes:add(self)
- insert(node.deeperelements, self.name, self)
- for k in pairs(self.attributes) do
- insert(node.deeperattributes, k, self)
- end
- if self.id then
- insert(node.deeperids, self.id, self)
- end
- for _,v in ipairs(self.classes) do
- insert(node.deeperclasses, v, self)
+ -- text nodes (name == nil) are added to deepernodes for * selector,
+ -- but not to other indexes since they can't be selected by element name, etc.
+ if self.name ~= nil then
+ insert(node.deeperelements, self.name, self)
+ for k in pairs(self.attributes) do
+ insert(node.deeperattributes, k, self)
+ end
+ if self.id then
+ insert(node.deeperids, self.id, self)
+ end
+ for _,v in ipairs(self.classes) do
+ insert(node.deeperclasses, v, self)
+ end
end
end
end
diff --git a/tst/init.lua b/tst/init.lua
index eb1bbf6..81ac87e 100644
--- a/tst/init.lua
+++ b/tst/init.lua
@@ -38,11 +38,11 @@ function test_void()
]])
- assert_equal(5, #tree.nodes, "top level")
+ assert_equal(9, #tree.nodes, "top level")
for _,n in ipairs(tree.nodes) do
if n.name == "p" then
- assert_equal(4, #n.nodes, "deeper level")
- else
+ assert_equal(9, #n.nodes, "deeper level")
+ elseif n.name then
assert_equal("br", n.name, "name")
assert_equal("", n:getcontent(), "content")
end
@@ -70,7 +70,7 @@ function test_class()
]])
- assert_equal(3, #tree.nodes, "top level")
+ assert_equal(5, #tree.nodes, "top level")
assert_equal(1, #tree(".one"), ".one")
assert_equal(2, #tree(".two"), ".two")
assert_equal(2, #tree(".three"), ".three")
@@ -126,10 +126,10 @@ function test_attr_notequal()
]])
- assert_equal(4, #tree.nodes, "top level")
- assert_equal(3, #tree("[a1!='a1']"), "a1!='a1'")
- assert_equal(4, #tree("[a1!='b1']"), "a1!='b1'")
- assert_equal(3, #tree("[a1!='']"), "a1!=''")
+ assert_equal(7, #tree.nodes, "top level")
+ assert_equal(6, #tree("[a1!='a1']"), "a1!='a1'")
+ assert_equal(7, #tree("[a1!='b1']"), "a1!='b1'")
+ assert_equal(6, #tree("[a1!='']"), "a1!=''")
assert_equal(3, #tree("[a1!=]"), "a1!=")
end
@@ -141,7 +141,7 @@ function test_attr_prefix_start_end()
]])
- assert_equal(5, #tree.nodes, "top level")
+ assert_equal(9, #tree.nodes, "top level")
assert_equal(3, #tree("[a1|='en']"), "a1|='en'")
assert_equal(4, #tree("[a1^='en']"), "a1^='en'")
assert_equal(2, #tree("[a1$='en']"), "a1$='en'")
@@ -154,7 +154,7 @@ function test_attr_word()
]])
- assert_equal(4, #tree.nodes, "top level")
+ assert_equal(7, #tree.nodes, "top level")
assert_equal(1, #tree("[a1~='two']"), "a1~='two'")
assert_equal(2, #tree("[a1~='three']"), "a1~='three'")
assert_equal(1, #tree("[a1~='four']"), "a1~='four'")
@@ -169,7 +169,7 @@ function test_attr_contains()
]])
- assert_equal(6, #tree.nodes, "top level")
+ assert_equal(11, #tree.nodes, "top level")
assert_equal(2, #tree("[a1*='one']"), "a1*='one'")
assert_equal(2, #tree("[a1*='t']"), "a1*='t'")
assert_equal(1, #tree("[a1*='f']"), "a1*='f'")
@@ -238,11 +238,11 @@ function test_not()
]])
- assert_equal(2, #tree.nodes, "top level")
- assert_equal(1, #tree(":not([a1=1])"), ":not([a1=1])")
- assert_equal(1, #tree(":not([a2])"), ":not([a2])")
- assert_equal(1, #tree(":not(n)"), ":not(n)")
- assert_equal(2, #tree(":not(m)"), ":not(m)")
+ assert_equal(3, #tree.nodes, "top level")
+ assert_equal(4, #tree(":not([a1=1])"), ":not([a1=1])")
+ assert_equal(4, #tree(":not([a2])"), ":not([a2])")
+ assert_equal(4, #tree(":not(n)"), ":not(n)")
+ assert_equal(5, #tree(":not(m)"), ":not(m)")
end
function test_combine()
@@ -256,7 +256,7 @@ function test_combine()
]])
- assert_equal(2, #tree.nodes, "top level")
+ assert_equal(3, #tree.nodes, "top level")
assert_equal(2, #tree("e.c:not([a|='1']) > n[b*='2']"), "e.c:not([a|='1']) > n[b*='2']")
assert_equal(3, #tree("e.c:not([a|='1']) n[b*='2']"), "e.c:not([a|='1']) n[b*='2']")
assert_equal(1, #tree("#123 .c[b]"), "#123 .c[b]")
@@ -290,9 +290,14 @@ function test_order()
assert_equal(i, tonumber(v:getcontent()), "n order")
end
local notn = tree(":not(n)")
- assert_equal(4, #notn, "notn")
+ assert_equal(31, #notn, "notn")
+ local blanks = 0
for i,v in pairs(notn) do
- assert_equal(i, tonumber(v.name), "notn order")
+ if v.name then
+ assert_equal(i, blanks+tonumber(v.name), "notn order")
+ else
+ blanks = blanks + 1
+ end
end
end
@@ -327,5 +332,44 @@ function test_loop_limit()
with unclosed attribute
]]) -- issue#42
- assert(#tree.nodes==17)
+ assert(#tree.nodes==33)
+end
+
+function test_text_nodes()
+ local tree = htmlparser.parse("line1
line2
")
+ assert_equal(1, #tree.nodes, "top level")
+ local p = tree.nodes[1]
+ assert_equal("p", p.name, "p element")
+ assert_equal(3, #p.nodes, "p should have 3 children")
+
+ assert_equal(nil, p.nodes[1].name, "first child should be text node")
+ assert_equal("line1", p.nodes[1]:gettext(), "first text content")
+
+ assert_equal("br", p.nodes[2].name, "second child should be br")
+
+ assert_equal(nil, p.nodes[3].name, "third child should be text node")
+ assert_equal("line2", p.nodes[3]:gettext(), "third text content")
+
+ assert_equal("line1
line2", p:getcontent(), "getcontent backward compatibility")
+end
+
+function test_text_nodes_whitespace()
+ local tree = htmlparser.parse("
")
+ assert_equal(1, #tree.nodes, "top level")
+ local p = tree.nodes[1]
+ assert_equal(3, #p.nodes, "p should have 3 children including whitespace")
+ assert_equal(nil, p.nodes[1].name, "first should be whitespace text node")
+ assert_equal(nil, p.nodes[3].name, "third should be whitespace text node")
+end
+
+function test_text_nodes_selectors()
+ local tree = htmlparser.parse("textinnermore
")
+ local div = tree.nodes[1]
+
+ local all = div:select("*")
+ assert_equal(4, #all, "* selector should return 4 nodes (3 text + 1 span)")
+
+ local spans = div:select("span")
+ assert_equal(1, #spans, "span selector should return 1 node")
+ assert_equal("span", spans[1].name, "should be span element")
end
From 7682c5e2e5cb0db06011f31530d01aa9bb047bd6 Mon Sep 17 00:00:00 2001
From: Remy Wang
Date: Thu, 20 Nov 2025 13:05:21 -0800
Subject: [PATCH 2/4] Fix grammar in README text-node note ("have" -> "has")
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index aa0d5f7..f7ec795 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ Supported selectors are a subset of [jQuery's selectors][1]:
Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
## Element type
-All tree elements provide, apart from `:select` and `()`, the following accessors (a text node have `nil` as its tagname):
+All tree elements provide, apart from `:select` and `()`, the following accessors (a text node has `nil` as its tagname):
### Basic
- `.name` the element's tagname
From 6e413618e47b9231c1f6f20ea0d6ce846022c461 Mon Sep 17 00:00:00 2001
From: Remy Wang
Date: Thu, 20 Nov 2025 13:23:20 -0800
Subject: [PATCH 3/4] Use _text as tagname for text nodes; rename gettext/settext to getrawtext/setrawtext
---
README.md | 2 +-
src/htmlparser.lua | 16 +++++++--------
src/htmlparser/ElementNode.lua | 36 +++++++++++++++-------------------
tst/init.lua | 22 +++++++++++++--------
4 files changed, 39 insertions(+), 37 deletions(-)
diff --git a/README.md b/README.md
index f7ec795..d3f8e26 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor
### Other
- `.index` sequence number of elements in order of appearance; root index is `0`
-- `:gettext()` the complete element text, starting with `""` or `""`
+- `:getrawtext()` the complete element text, starting with `""` or `""`
- `.level` how deep the element is in the tree; root level is `0`
- `.root` the root element of the tree; `root.root` is `root`
- `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none
diff --git a/src/htmlparser.lua b/src/htmlparser.lua
index ce9ac8d..b89479c 100644
--- a/src/htmlparser.lua
+++ b/src/htmlparser.lua
@@ -153,7 +153,7 @@ local function parse(text,limit) -- {{{
end -- }}}
-- openstart/tpos Definitions {{{
local openstart, name
- openstart, tpos, name = root._text:find(
+ openstart, tpos, name = root._rawtext:find(
"<" .. -- an uncaptured starting "<"
"([%w-]+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
@@ -164,7 +164,7 @@ local function parse(text,limit) -- {{{
-- Create text node for any text before this element {{{
if lasttagend and openstart and lasttagend < openstart then
index = index + 1
- local textnode = ElementNode:new(index, nil, node, descend, lasttagend, openstart - 1)
+ local textnode = ElementNode:new(index, "_text", node, descend, lasttagend, openstart - 1)
textnode:close()
dbg("[MainLoop]:#LINE# Created text node from %d to %d", lasttagend, openstart - 1)
end
@@ -175,7 +175,7 @@ local function parse(text,limit) -- {{{
local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos)
node = tag
local tagloop
- local tagst, apos = tag:gettext(), 1
+ local tagst, apos = tag:getrawtext(), 1
-- }}}
while true do -- TagLoop {{{
dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop))
@@ -240,7 +240,7 @@ local function parse(text,limit) -- {{{
end
local closestart, closing, closename
- closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
+ closestart, closeend, closing, closename = root._rawtext:find("[^<]*<(/?)([%w-]+)", closeend)
dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename))
if not closing or closing == "" then break end
@@ -249,18 +249,18 @@ local function parse(text,limit) -- {{{
-- Create text node for any text before this closing tag {{{
if closing == "/" then
- local tagstart = root._text:find("<", closestart)
+ local tagstart = root._rawtext:find("<", closestart)
if lasttagend and tagstart and lasttagend < tagstart then
index = index + 1
-- Text before closing tag should be a child of the tag being closed
- local textnode = ElementNode:new(index, nil, tag, true, lasttagend, tagstart - 1)
+ local textnode = ElementNode:new(index, "_text", tag, true, lasttagend, tagstart - 1)
textnode:close()
dbg("[TagCloseLoop]:#LINE# Created text node from %d to %d", lasttagend, tagstart - 1)
end
lasttagend = closeend + 1
end
-- }}}
- closestart = root._text:find("<", closestart)
+ closestart = root._rawtext:find("<", closestart)
dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart))
tag:close(closestart, closeend + 1)
node = tag.parent
@@ -271,7 +271,7 @@ local function parse(text,limit) -- {{{
if tpl then -- {{{
dbg("tpl")
for k,v in pairs(tpr) do
- root._text = root._text:gsub(v,k)
+ root._rawtext = root._rawtext:gsub(v,k)
end
end -- }}}
return root
diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua
index 41f1011..88716d4 100644
--- a/src/htmlparser/ElementNode.lua
+++ b/src/htmlparser/ElementNode.lua
@@ -106,7 +106,7 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend)
if not node then
instance.name = "root"
instance.root = instance
- instance._text = nameortext
+ instance._rawtext = nameortext
local length = string.len(nameortext)
instance._openstart, instance._openend = 1, length
instance._closestart, instance._closeend = 1, length
@@ -124,20 +124,20 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend)
return setmetatable(instance, ElementNode.mt)
end
-function ElementNode:gettext()
- return string.sub(self.root._text, self._openstart, self._closeend)
+function ElementNode:getrawtext()
+ return string.sub(self.root._rawtext, self._openstart, self._closeend)
end
-function ElementNode:settext(c)
- self.root._text=c
+function ElementNode:setrawtext(c)
+ self.root._rawtext=c
end
function ElementNode:textonly()
- return (self:gettext():gsub("<[^>]*>",""))
+ return (self:getrawtext():gsub("<[^>]*>",""))
end
function ElementNode:getcontent()
- return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
+ return string.sub(self.root._rawtext, self._openend + 1, self._closestart - 1)
end
function ElementNode:addattribute(k, v)
@@ -167,19 +167,15 @@ function ElementNode:close(closestart, closeend)
node = node.parent
if not node then break end
node.deepernodes:add(self)
- -- text nodes (name == nil) are added to deepernodes for * selector,
- -- but not to other indexes since they can't be selected by element name, etc.
- if self.name ~= nil then
- insert(node.deeperelements, self.name, self)
- for k in pairs(self.attributes) do
- insert(node.deeperattributes, k, self)
- end
- if self.id then
- insert(node.deeperids, self.id, self)
- end
- for _,v in ipairs(self.classes) do
- insert(node.deeperclasses, v, self)
- end
+ insert(node.deeperelements, self.name, self)
+ for k in pairs(self.attributes) do
+ insert(node.deeperattributes, k, self)
+ end
+ if self.id then
+ insert(node.deeperids, self.id, self)
+ end
+ for _,v in ipairs(self.classes) do
+ insert(node.deeperclasses, v, self)
end
end
end
diff --git a/tst/init.lua b/tst/init.lua
index 81ac87e..b8ee7fd 100644
--- a/tst/init.lua
+++ b/tst/init.lua
@@ -42,7 +42,7 @@ function test_void()
for _,n in ipairs(tree.nodes) do
if n.name == "p" then
assert_equal(9, #n.nodes, "deeper level")
- elseif n.name then
+ elseif n.name ~= "_text" then
assert_equal("br", n.name, "name")
assert_equal("", n:getcontent(), "content")
end
@@ -293,7 +293,7 @@ function test_order()
assert_equal(31, #notn, "notn")
local blanks = 0
for i,v in pairs(notn) do
- if v.name then
+ if v.name ~= "_text" then
assert_equal(i, blanks+tonumber(v.name), "notn order")
else
blanks = blanks + 1
@@ -342,13 +342,13 @@ function test_text_nodes()
assert_equal("p", p.name, "p element")
assert_equal(3, #p.nodes, "p should have 3 children")
- assert_equal(nil, p.nodes[1].name, "first child should be text node")
- assert_equal("line1", p.nodes[1]:gettext(), "first text content")
+ assert_equal("_text", p.nodes[1].name, "first child should be text node")
+ assert_equal("line1", p.nodes[1]:getrawtext(), "first text content")
assert_equal("br", p.nodes[2].name, "second child should be br")
- assert_equal(nil, p.nodes[3].name, "third child should be text node")
- assert_equal("line2", p.nodes[3]:gettext(), "third text content")
+ assert_equal("_text", p.nodes[3].name, "third child should be text node")
+ assert_equal("line2", p.nodes[3]:getrawtext(), "third text content")
assert_equal("line1
line2", p:getcontent(), "getcontent backward compatibility")
end
@@ -358,8 +358,8 @@ function test_text_nodes_whitespace()
assert_equal(1, #tree.nodes, "top level")
local p = tree.nodes[1]
assert_equal(3, #p.nodes, "p should have 3 children including whitespace")
- assert_equal(nil, p.nodes[1].name, "first should be whitespace text node")
- assert_equal(nil, p.nodes[3].name, "third should be whitespace text node")
+ assert_equal("_text", p.nodes[1].name, "first should be whitespace text node")
+ assert_equal("_text", p.nodes[3].name, "third should be whitespace text node")
end
function test_text_nodes_selectors()
@@ -372,4 +372,10 @@ function test_text_nodes_selectors()
local spans = div:select("span")
assert_equal(1, #spans, "span selector should return 1 node")
assert_equal("span", spans[1].name, "should be span element")
+
+ local texts = div:select("_text")
+ assert_equal(3, #texts, "_text selector should return 3 text nodes")
+ for i, node in ipairs(texts) do
+ assert_equal("_text", node.name, "should be text node")
+ end
end
From f0dce27326c34e8123821210eb734861177efec3 Mon Sep 17 00:00:00 2001
From: Remy Wang
Date: Thu, 20 Nov 2025 13:38:00 -0800
Subject: [PATCH 4/4] Remove stale nil-tagname note for text nodes from README
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index d3f8e26..bb433c1 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ Supported selectors are a subset of [jQuery's selectors][1]:
Selectors can be combined; e.g. `".class:not([attribute]) element.class"`
## Element type
-All tree elements provide, apart from `:select` and `()`, the following accessors (a text node has `nil` as its tagname):
+All tree elements provide, apart from `:select` and `()`, the following accessors:
### Basic
- `.name` the element's tagname