diff --git a/README.md b/README.md index 37149aa..bb433c1 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor ### Other - `.index` sequence number of elements in order of appearance; root index is `0` -- `:gettext()` the complete element text, starting with `""` or `""` +- `:getrawtext()` the complete element text, starting with `""` or `""` - `.level` how deep the element is in the tree; root level is `0` - `.root` the root element of the tree; `root.root` is `root` - `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none @@ -96,7 +96,6 @@ All tree elements provide, apart from `:select` and `()`, the following accessor - Attribute values in selector strings cannot contain any spaces - The spaces before and after the `>` in a `parent > child` relation are mandatory - `line1
line2

")`, `root.nodes[1]:getcontent()` is `"line1
line2"`, while `root.nodes[1].nodes[1].name` is `"br"` - No start or end tags are implied when [omitted](http://www.w3.org/TR/html5/syntax.html#optional-tags). Only the [void elements](http://www.w3.org/TR/html5/syntax.html#void-elements) should not have an end tag - No validation is done for tag or attribute names or nesting of element types. The list of void elements is in fact the only part specific to HTML diff --git a/src/htmlparser.lua b/src/htmlparser.lua index c6226be..b89479c 100644 --- a/src/htmlparser.lua +++ b/src/htmlparser.lua @@ -144,6 +144,7 @@ local function parse(text,limit) -- {{{ local index = 0 local root = ElementNode:new(index, str(text)) local node, descend, tpos, opentags = root, true, 1, {} + local lasttagend = nil -- position after last tag ended (nil = start of document) while true do -- MainLoop {{{ if index == limit then -- {{{ @@ -152,7 +153,7 @@ local function parse(text,limit) -- {{{ end -- }}} -- openstart/tpos Definitions {{{ local openstart, name - openstart, tpos, name = root._text:find( + openstart, tpos, name = root._rawtext:find( "<" .. -- an uncaptured starting "<" "([%w-]+)" .. -- name = the first word, directly following the "<" "[^>]*>", -- include, but not capture everything up to the next ">" @@ -160,12 +161,21 @@ local function parse(text,limit) -- {{{ dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name)) -- }}} if not name then break end + -- Create text node for any text before this element {{{ + if lasttagend and openstart and lasttagend < openstart then + index = index + 1 + local textnode = ElementNode:new(index, "_text", node, descend, lasttagend, openstart - 1) + textnode:close() + dbg("[MainLoop]:#LINE# Created text node from %d to %d", lasttagend, openstart - 1) + end + lasttagend = tpos + 1 + -- }}} -- Some more vars {{{ index = index + 1 local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos) node = tag local tagloop - local tagst, apos = tag:gettext(), 1 + local tagst, apos = tag:getrawtext(), 1 -- }}} while true do -- TagLoop {{{ dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop)) @@ -230,13 +240,27 @@ local function parse(text,limit) -- {{{ end local closestart, closing, closename - closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend) + closestart, closeend, closing, closename = root._rawtext:find("[^<]*<(/?)([%w-]+)", closeend) dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename)) if not closing or closing == "" then break end tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags - closestart = root._text:find("<", closestart) + + -- Create text node for any text before this closing tag {{{ + if closing == "/" then + local tagstart = root._rawtext:find("<", closestart) + if lasttagend and tagstart and lasttagend < tagstart then + index = index + 1 + -- Text before closing tag should be a child of the tag being closed + local textnode = ElementNode:new(index, "_text", tag, true, lasttagend, tagstart - 1) + textnode:close() + dbg("[TagCloseLoop]:#LINE# Created text node from %d to %d", lasttagend, tagstart - 1) + end + lasttagend = closeend + 1 + end + -- }}} + closestart = root._rawtext:find("<", closestart) dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart)) tag:close(closestart, closeend + 1) node = tag.parent @@ -247,7 +271,7 @@ local function parse(text,limit) -- {{{ if tpl then -- {{{ dbg("tpl") for k,v in pairs(tpr) do - root._text = root._text:gsub(v,k) + root._rawtext = root._rawtext:gsub(v,k) end end -- }}} return root diff --git a/src/htmlparser/ElementNode.lua b/src/htmlparser/ElementNode.lua index 329043d..88716d4 100644 --- a/src/htmlparser/ElementNode.lua +++ b/src/htmlparser/ElementNode.lua @@ -106,7 +106,7 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend) if not node then instance.name = "root" instance.root = instance - instance._text = nameortext + instance._rawtext = nameortext local length = string.len(nameortext) instance._openstart, instance._openend = 1, length instance._closestart, instance._closeend = 1, length @@ -124,20 +124,20 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend) return setmetatable(instance, ElementNode.mt) end -function ElementNode:gettext() - return string.sub(self.root._text, self._openstart, self._closeend) +function ElementNode:getrawtext() + return string.sub(self.root._rawtext, self._openstart, self._closeend) end -function ElementNode:settext(c) - self.root._text=c +function ElementNode:setrawtext(c) + self.root._rawtext=c end function ElementNode:textonly() - return (self:gettext():gsub("<[^>]*>","")) + return (self:getrawtext():gsub("<[^>]*>","")) end function ElementNode:getcontent() - return string.sub(self.root._text, self._openend + 1, self._closestart - 1) + return string.sub(self.root._rawtext, self._openend + 1, self._closestart - 1) end function ElementNode:addattribute(k, v) diff --git a/tst/init.lua b/tst/init.lua index eb1bbf6..b8ee7fd 100644 --- a/tst/init.lua +++ b/tst/init.lua @@ -38,11 +38,11 @@ function test_void()

]]) - assert_equal(5, #tree.nodes, "top level") + assert_equal(9, #tree.nodes, "top level") for _,n in ipairs(tree.nodes) do if n.name == "p" then - assert_equal(4, #n.nodes, "deeper level") - else + assert_equal(9, #n.nodes, "deeper level") + elseif n.name ~= "_text" then assert_equal("br", n.name, "name") assert_equal("", n:getcontent(), "content") end @@ -70,7 +70,7 @@ function test_class() ]]) - assert_equal(3, #tree.nodes, "top level") + assert_equal(5, #tree.nodes, "top level") assert_equal(1, #tree(".one"), ".one") assert_equal(2, #tree(".two"), ".two") assert_equal(2, #tree(".three"), ".three") @@ -126,10 +126,10 @@ function test_attr_notequal() ]]) - assert_equal(4, #tree.nodes, "top level") - assert_equal(3, #tree("[a1!='a1']"), "a1!='a1'") - assert_equal(4, #tree("[a1!='b1']"), "a1!='b1'") - assert_equal(3, #tree("[a1!='']"), "a1!=''") + assert_equal(7, #tree.nodes, "top level") + assert_equal(6, #tree("[a1!='a1']"), "a1!='a1'") + assert_equal(7, #tree("[a1!='b1']"), "a1!='b1'") + assert_equal(6, #tree("[a1!='']"), "a1!=''") assert_equal(3, #tree("[a1!=]"), "a1!=") end @@ -141,7 +141,7 @@ function test_attr_prefix_start_end() ]]) - assert_equal(5, #tree.nodes, "top level") + assert_equal(9, #tree.nodes, "top level") assert_equal(3, #tree("[a1|='en']"), "a1|='en'") assert_equal(4, #tree("[a1^='en']"), "a1^='en'") assert_equal(2, #tree("[a1$='en']"), "a1$='en'") @@ -154,7 +154,7 @@ function test_attr_word() ]]) - assert_equal(4, #tree.nodes, "top level") + assert_equal(7, #tree.nodes, "top level") assert_equal(1, #tree("[a1~='two']"), "a1~='two'") assert_equal(2, #tree("[a1~='three']"), "a1~='three'") assert_equal(1, #tree("[a1~='four']"), "a1~='four'") @@ -169,7 +169,7 @@ function test_attr_contains() ]]) - assert_equal(6, #tree.nodes, "top level") + assert_equal(11, #tree.nodes, "top level") assert_equal(2, #tree("[a1*='one']"), "a1*='one'") assert_equal(2, #tree("[a1*='t']"), "a1*='t'") assert_equal(1, #tree("[a1*='f']"), "a1*='f'") @@ -238,11 +238,11 @@ function test_not() ]]) - assert_equal(2, #tree.nodes, "top level") - assert_equal(1, #tree(":not([a1=1])"), ":not([a1=1])") - assert_equal(1, #tree(":not([a2])"), ":not([a2])") - assert_equal(1, #tree(":not(n)"), ":not(n)") - assert_equal(2, #tree(":not(m)"), ":not(m)") + assert_equal(3, #tree.nodes, "top level") + assert_equal(4, #tree(":not([a1=1])"), ":not([a1=1])") + assert_equal(4, #tree(":not([a2])"), ":not([a2])") + assert_equal(4, #tree(":not(n)"), ":not(n)") + assert_equal(5, #tree(":not(m)"), ":not(m)") end function test_combine() @@ -256,7 +256,7 @@ function test_combine() ]]) - assert_equal(2, #tree.nodes, "top level") + assert_equal(3, #tree.nodes, "top level") assert_equal(2, #tree("e.c:not([a|='1']) > n[b*='2']"), "e.c:not([a|='1']) > n[b*='2']") assert_equal(3, #tree("e.c:not([a|='1']) n[b*='2']"), "e.c:not([a|='1']) n[b*='2']") assert_equal(1, #tree("#123 .c[b]"), "#123 .c[b]") @@ -290,9 +290,14 @@ function test_order() assert_equal(i, tonumber(v:getcontent()), "n order") end local notn = tree(":not(n)") - assert_equal(4, #notn, "notn") + assert_equal(31, #notn, "notn") + local blanks = 0 for i,v in pairs(notn) do - assert_equal(i, tonumber(v.name), "notn order") + if v.name ~= "_text" then + assert_equal(i, blanks+tonumber(v.name), "notn order") + else + blanks = blanks + 1 + end end end @@ -327,5 +332,50 @@ function test_loop_limit() with unclosed attribute
]]) -- issue#42 - assert(#tree.nodes==17) + assert(#tree.nodes==33) +end + +function test_text_nodes() + local tree = htmlparser.parse("

line1
line2

") + assert_equal(1, #tree.nodes, "top level") + local p = tree.nodes[1] + assert_equal("p", p.name, "p element") + assert_equal(3, #p.nodes, "p should have 3 children") + + assert_equal("_text", p.nodes[1].name, "first child should be text node") + assert_equal("line1", p.nodes[1]:getrawtext(), "first text content") + + assert_equal("br", p.nodes[2].name, "second child should be br") + + assert_equal("_text", p.nodes[3].name, "third child should be text node") + assert_equal("line2", p.nodes[3]:getrawtext(), "third text content") + + assert_equal("line1
line2", p:getcontent(), "getcontent backward compatibility") +end + +function test_text_nodes_whitespace() + local tree = htmlparser.parse("


") + assert_equal(1, #tree.nodes, "top level") + local p = tree.nodes[1] + assert_equal(3, #p.nodes, "p should have 3 children including whitespace") + assert_equal("_text", p.nodes[1].name, "first should be whitespace text node") + assert_equal("_text", p.nodes[3].name, "third should be whitespace text node") +end + +function test_text_nodes_selectors() + local tree = htmlparser.parse("
textinnermore
") + local div = tree.nodes[1] + + local all = div:select("*") + assert_equal(4, #all, "* selector should return 4 nodes (3 text + 1 span)") + + local spans = div:select("span") + assert_equal(1, #spans, "span selector should return 1 node") + assert_equal("span", spans[1].name, "should be span element") + + local texts = div:select("_text") + assert_equal(3, #texts, "_text selector should return 3 text nodes") + for i, node in ipairs(texts) do + assert_equal("_text", node.name, "should be text node") + end end