diff --git a/README.md b/README.md
index 37149aa..bb433c1 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor
### Other
- `.index` sequence number of elements in order of appearance; root index is `0`
-- `:gettext()` the complete element text, starting with `"<elementname"` and ending with `"/>"` or `"</elementname>"`
+- `:getrawtext()` the complete element text, starting with `"<elementname"` and ending with `"/>"` or `"</elementname>"`
- `.level` how deep the element is in the tree; root level is `0`
- `.root` the root element of the tree; `root.root` is `root`
- `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none
@@ -96,7 +96,6 @@ All tree elements provide, apart from `:select` and `()`, the following accessor
- Attribute values in selector strings cannot contain any spaces
- The spaces before and after the `>` in a `parent > child` relation are mandatory
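- `<!` elements (including doctype, comments, and CDATA) are not parsed; markup within CDATA is *not* escaped
-- Textnodes are no separate tree elements; in `local root = htmlparser.parse("<p>line1<br />line2</p>")`, `root.nodes[1]:getcontent()` is `"line1<br />line2"`, while `root.nodes[1].nodes[1].name` is `"br"`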
- No start or end tags are implied when [omitted](http://www.w3.org/TR/html5/syntax.html#optional-tags). Only the [void elements](http://www.w3.org/TR/html5/syntax.html#void-elements) should not have an end tag
- No validation is done for tag or attribute names or nesting of element types. The list of void elements is in fact the only part specific to HTML
diff --git a/src/htmlparser.lua b/src/htmlparser.lua
index c6226be..b89479c 100644
--- a/src/htmlparser.lua
+++ b/src/htmlparser.lua
@@ -144,6 +144,7 @@ local function parse(text,limit) -- {{{
local index = 0
local root = ElementNode:new(index, str(text))
local node, descend, tpos, opentags = root, true, 1, {}
+ local lasttagend = nil -- position after last tag ended (nil = start of document)
while true do -- MainLoop {{{
if index == limit then -- {{{
@@ -152,7 +153,7 @@ local function parse(text,limit) -- {{{
end -- }}}
-- openstart/tpos Definitions {{{
local openstart, name
- openstart, tpos, name = root._text:find(
+ openstart, tpos, name = root._rawtext:find(
"<" .. -- an uncaptured starting "<"
"([%w-]+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
@@ -160,12 +161,21 @@ local function parse(text,limit) -- {{{
dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name))
-- }}}
if not name then break end
+ -- Create text node for any text before this element {{{
+ if lasttagend and openstart and lasttagend < openstart then
+ index = index + 1
+ local textnode = ElementNode:new(index, "_text", node, descend, lasttagend, openstart - 1)
+ textnode:close()
+ dbg("[MainLoop]:#LINE# Created text node from %d to %d", lasttagend, openstart - 1)
+ end
+ lasttagend = tpos + 1
+ -- }}}
-- Some more vars {{{
index = index + 1
local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos)
node = tag
local tagloop
- local tagst, apos = tag:gettext(), 1
+ local tagst, apos = tag:getrawtext(), 1
-- }}}
while true do -- TagLoop {{{
dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop))
@@ -230,13 +240,27 @@ local function parse(text,limit) -- {{{
end
local closestart, closing, closename
- closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
+ closestart, closeend, closing, closename = root._rawtext:find("[^<]*<(/?)([%w-]+)", closeend)
dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename))
if not closing or closing == "" then break end
tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
- closestart = root._text:find("<", closestart)
+
+			-- Create text node for any text before this closing tag {{{
+			-- `closing` is always "/" at this point: the empty/nil case already
+			-- broke out of the loop above, so no extra guard is needed.
+			closestart = root._rawtext:find("<", closestart)
+			if lasttagend and closestart and lasttagend < closestart then
+				index = index + 1
+				-- Text before closing tag should be a child of the tag being closed
+				local textnode = ElementNode:new(index, "_text", tag, true, lasttagend, closestart - 1)
+				textnode:close()
+				dbg("[TagCloseLoop]:#LINE# Created text node from %d to %d", lasttagend, closestart - 1)
+			end
+			-- The find pattern above stops at the last character of the tag name,
+			-- so closeend + 1 is the closing ">" (the same inclusive end position
+			-- that is handed to tag:close). Text can only resume one character
+			-- later; using closeend + 1 here would pull the ">" into the next
+			-- text node and create a bogus ">" node between adjacent tags.
+			lasttagend = closeend + 2
+			-- }}}
dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart))
tag:close(closestart, closeend + 1)
node = tag.parent
@@ -247,7 +271,7 @@ local function parse(text,limit) -- {{{
if tpl then -- {{{
dbg("tpl")
for k,v in pairs(tpr) do
- root._text = root._text:gsub(v,k)
+ root._rawtext = root._rawtext:gsub(v,k)
end
end -- }}}
return root
diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua
index 329043d..88716d4 100644
--- a/src/htmlparser/ElementNode.lua
+++ b/src/htmlparser/ElementNode.lua
@@ -106,7 +106,7 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend)
if not node then
instance.name = "root"
instance.root = instance
- instance._text = nameortext
+ instance._rawtext = nameortext
local length = string.len(nameortext)
instance._openstart, instance._openend = 1, length
instance._closestart, instance._closeend = 1, length
@@ -124,20 +124,20 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend)
return setmetatable(instance, ElementNode.mt)
end
-function ElementNode:gettext()
- return string.sub(self.root._text, self._openstart, self._closeend)
+function ElementNode:getrawtext()
+ return string.sub(self.root._rawtext, self._openstart, self._closeend)
end
-function ElementNode:settext(c)
- self.root._text=c
+function ElementNode:setrawtext(c)
+ self.root._rawtext=c
end
function ElementNode:textonly()
- return (self:gettext():gsub("<[^>]*>",""))
+ return (self:getrawtext():gsub("<[^>]*>",""))
end
function ElementNode:getcontent()
- return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
+ return string.sub(self.root._rawtext, self._openend + 1, self._closestart - 1)
end
function ElementNode:addattribute(k, v)
diff --git a/tst/init.lua b/tst/init.lua
index eb1bbf6..b8ee7fd 100644
--- a/tst/init.lua
+++ b/tst/init.lua
@@ -38,11 +38,11 @@ function test_void()
]])
- assert_equal(5, #tree.nodes, "top level")
+ assert_equal(9, #tree.nodes, "top level")
for _,n in ipairs(tree.nodes) do
if n.name == "p" then
- assert_equal(4, #n.nodes, "deeper level")
- else
+ assert_equal(9, #n.nodes, "deeper level")
+ elseif n.name ~= "_text" then
assert_equal("br", n.name, "name")
assert_equal("", n:getcontent(), "content")
end
@@ -70,7 +70,7 @@ function test_class()
]])
- assert_equal(3, #tree.nodes, "top level")
+ assert_equal(5, #tree.nodes, "top level")
assert_equal(1, #tree(".one"), ".one")
assert_equal(2, #tree(".two"), ".two")
assert_equal(2, #tree(".three"), ".three")
@@ -126,10 +126,10 @@ function test_attr_notequal()
]])
- assert_equal(4, #tree.nodes, "top level")
- assert_equal(3, #tree("[a1!='a1']"), "a1!='a1'")
- assert_equal(4, #tree("[a1!='b1']"), "a1!='b1'")
- assert_equal(3, #tree("[a1!='']"), "a1!=''")
+ assert_equal(7, #tree.nodes, "top level")
+ assert_equal(6, #tree("[a1!='a1']"), "a1!='a1'")
+ assert_equal(7, #tree("[a1!='b1']"), "a1!='b1'")
+ assert_equal(6, #tree("[a1!='']"), "a1!=''")
assert_equal(3, #tree("[a1!=]"), "a1!=")
end
@@ -141,7 +141,7 @@ function test_attr_prefix_start_end()
]])
- assert_equal(5, #tree.nodes, "top level")
+ assert_equal(9, #tree.nodes, "top level")
assert_equal(3, #tree("[a1|='en']"), "a1|='en'")
assert_equal(4, #tree("[a1^='en']"), "a1^='en'")
assert_equal(2, #tree("[a1$='en']"), "a1$='en'")
@@ -154,7 +154,7 @@ function test_attr_word()
]])
- assert_equal(4, #tree.nodes, "top level")
+ assert_equal(7, #tree.nodes, "top level")
assert_equal(1, #tree("[a1~='two']"), "a1~='two'")
assert_equal(2, #tree("[a1~='three']"), "a1~='three'")
assert_equal(1, #tree("[a1~='four']"), "a1~='four'")
@@ -169,7 +169,7 @@ function test_attr_contains()
]])
- assert_equal(6, #tree.nodes, "top level")
+ assert_equal(11, #tree.nodes, "top level")
assert_equal(2, #tree("[a1*='one']"), "a1*='one'")
assert_equal(2, #tree("[a1*='t']"), "a1*='t'")
assert_equal(1, #tree("[a1*='f']"), "a1*='f'")
@@ -238,11 +238,11 @@ function test_not()
]])
- assert_equal(2, #tree.nodes, "top level")
- assert_equal(1, #tree(":not([a1=1])"), ":not([a1=1])")
- assert_equal(1, #tree(":not([a2])"), ":not([a2])")
- assert_equal(1, #tree(":not(n)"), ":not(n)")
- assert_equal(2, #tree(":not(m)"), ":not(m)")
+ assert_equal(3, #tree.nodes, "top level")
+ assert_equal(4, #tree(":not([a1=1])"), ":not([a1=1])")
+ assert_equal(4, #tree(":not([a2])"), ":not([a2])")
+ assert_equal(4, #tree(":not(n)"), ":not(n)")
+ assert_equal(5, #tree(":not(m)"), ":not(m)")
end
function test_combine()
@@ -256,7 +256,7 @@ function test_combine()
]])
- assert_equal(2, #tree.nodes, "top level")
+ assert_equal(3, #tree.nodes, "top level")
assert_equal(2, #tree("e.c:not([a|='1']) > n[b*='2']"), "e.c:not([a|='1']) > n[b*='2']")
assert_equal(3, #tree("e.c:not([a|='1']) n[b*='2']"), "e.c:not([a|='1']) n[b*='2']")
assert_equal(1, #tree("#123 .c[b]"), "#123 .c[b]")
@@ -290,9 +290,14 @@ function test_order()
assert_equal(i, tonumber(v:getcontent()), "n order")
end
local notn = tree(":not(n)")
- assert_equal(4, #notn, "notn")
+ assert_equal(31, #notn, "notn")
+ local blanks = 0
for i,v in pairs(notn) do
- assert_equal(i, tonumber(v.name), "notn order")
+ if v.name ~= "_text" then
+ assert_equal(i, blanks+tonumber(v.name), "notn order")
+ else
+ blanks = blanks + 1
+ end
end
end
@@ -327,5 +332,50 @@ function test_loop_limit()
with unclosed attribute
]]) -- issue#42
- assert(#tree.nodes==17)
+ assert(#tree.nodes==33)
+end
+
+-- Text directly inside an element must show up as "_text" child nodes,
+-- interleaved with element children in document order, while getcontent()
+-- keeps returning the raw inner markup.
+-- NOTE(review): the markup inside the string literals was lost in this copy
+-- of the patch; it is reconstructed from the assertions below (a <p> holding
+-- text, a <br />, and more text) -- confirm against the original commit.
+function test_text_nodes()
+	local tree = htmlparser.parse("<p>line1<br />line2</p>")
+	assert_equal(1, #tree.nodes, "top level")
+	local p = tree.nodes[1]
+	assert_equal("p", p.name, "p element")
+	assert_equal(3, #p.nodes, "p should have 3 children")
+
+	assert_equal("_text", p.nodes[1].name, "first child should be text node")
+	assert_equal("line1", p.nodes[1]:getrawtext(), "first text content")
+
+	assert_equal("br", p.nodes[2].name, "second child should be br")
+
+	assert_equal("_text", p.nodes[3].name, "third child should be text node")
+	assert_equal("line2", p.nodes[3]:getrawtext(), "third text content")
+
+	assert_equal("line1<br />line2", p:getcontent(), "getcontent backward compatibility")
+end
+
+-- Whitespace-only runs between tags are kept as "_text" nodes too.
+-- NOTE(review): the original markup of this string literal was lost in this
+-- copy of the patch. The input below is a minimal reconstruction that fits
+-- the assertions (a <p> with whitespace, one element, whitespace); the
+-- middle element is a guess -- confirm against the original commit.
+function test_text_nodes_whitespace()
+	local tree = htmlparser.parse("<p> <br /> </p>")
+	assert_equal(1, #tree.nodes, "top level")
+	local p = tree.nodes[1]
+	assert_equal(3, #p.nodes, "p should have 3 children including whitespace")
+	assert_equal("_text", p.nodes[1].name, "first should be whitespace text node")
+	assert_equal("_text", p.nodes[3].name, "third should be whitespace text node")
+end
+
+-- Text nodes take part in selection: "*" returns them alongside elements,
+-- an element selector skips them, and "_text" selects only them.
+-- NOTE(review): the markup inside the string literal was lost in this copy
+-- of the patch; it is reconstructed from the assertions below (a <div> with
+-- one <span> and three text runs) -- confirm against the original commit.
+function test_text_nodes_selectors()
+	local tree = htmlparser.parse("<div>text<span>inner</span>more</div>")
+	local div = tree.nodes[1]
+
+	local all = div:select("*")
+	assert_equal(4, #all, "* selector should return 4 nodes (3 text + 1 span)")
+
+	local spans = div:select("span")
+	assert_equal(1, #spans, "span selector should return 1 node")
+	assert_equal("span", spans[1].name, "should be span element")
+
+	local texts = div:select("_text")
+	assert_equal(3, #texts, "_text selector should return 3 text nodes")
+	for _, node in ipairs(texts) do
+		assert_equal("_text", node.name, "should be text node")
+	end
end