Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ All tree elements provide, apart from `:select` and `()`, the following accessor

### Other
- `.index` sequence number of elements in order of appearance; root index is `0`
- `:gettext()` the complete element text, starting with `"<tagname"` and ending with `"/>"` or `"</tagname>"`
- `:getrawtext()` the complete element text, starting with `"<tagname"` and ending with `"/>"` or `"</tagname>"`
- `.level` how deep the element is in the tree; root level is `0`
- `.root` the root element of the tree; `root.root` is `root`
- `.deepernodes` a [Set][1] containing all elements in the tree beneath this element, including this element's `.nodes`; `{}` if none
Expand All @@ -96,7 +96,6 @@ All tree elements provide, apart from `:select` and `()`, the following accessor
- Attribute values in selector strings cannot contain any spaces
- The spaces before and after the `>` in a `parent > child` relation are mandatory
- `<!` elements (including doctype, comments, and CDATA) are not parsed; markup within CDATA is *not* escaped
- Textnodes are no separate tree elements; in `local root = htmlparser.parse("<p>line1<br />line2</p>")`, `root.nodes[1]:getcontent()` is `"line1<br />line2"`, while `root.nodes[1].nodes[1].name` is `"br"`
- No start or end tags are implied when [omitted](http://www.w3.org/TR/html5/syntax.html#optional-tags). Only the [void elements](http://www.w3.org/TR/html5/syntax.html#void-elements) should not have an end tag
- No validation is done for tag or attribute names or nesting of element types. The list of void elements is in fact the only part specific to HTML

Expand Down
34 changes: 29 additions & 5 deletions src/htmlparser.lua
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ local function parse(text,limit) -- {{{
local index = 0
local root = ElementNode:new(index, str(text))
local node, descend, tpos, opentags = root, true, 1, {}
local lasttagend = nil -- position after last tag ended (nil = start of document)

while true do -- MainLoop {{{
if index == limit then -- {{{
Expand All @@ -152,20 +153,29 @@ local function parse(text,limit) -- {{{
end -- }}}
-- openstart/tpos Definitions {{{
local openstart, name
openstart, tpos, name = root._text:find(
openstart, tpos, name = root._rawtext:find(
"<" .. -- an uncaptured starting "<"
"([%w-]+)" .. -- name = the first word, directly following the "<"
"[^>]*>", -- include, but not capture everything up to the next ">"
tpos)
dbg("[MainLoop]:#LINE# openstart=%s || tpos=%s || name=%s",str(openstart),str(tpos),str(name))
-- }}}
if not name then break end
-- Create text node for any text before this element {{{
if lasttagend and openstart and lasttagend < openstart then
index = index + 1
local textnode = ElementNode:new(index, "_text", node, descend, lasttagend, openstart - 1)
textnode:close()
dbg("[MainLoop]:#LINE# Created text node from %d to %d", lasttagend, openstart - 1)
end
lasttagend = tpos + 1
-- }}}
-- Some more vars {{{
index = index + 1
local tag = ElementNode:new(index, str(name), (node or {}), descend, openstart, tpos)
node = tag
local tagloop
local tagst, apos = tag:gettext(), 1
local tagst, apos = tag:getrawtext(), 1
-- }}}
while true do -- TagLoop {{{
dbg("[TagLoop]:#LINE# tag.name=%s, tagloop=%s",str(tag.name),str(tagloop))
Expand Down Expand Up @@ -230,13 +240,27 @@ local function parse(text,limit) -- {{{
end

local closestart, closing, closename
closestart, closeend, closing, closename = root._text:find("[^<]*<(/?)([%w-]+)", closeend)
closestart, closeend, closing, closename = root._rawtext:find("[^<]*<(/?)([%w-]+)", closeend)
dbg("[TagCloseLoop]:#LINE# closestart=%s || closeend=%s || closing=%s || closename=%s",str(closestart),str(closeend),str(closing),str(closename))

if not closing or closing == "" then break end

tag = table.remove(opentags[closename] or {}) or tag -- kludges for the cases of closing void or non-opened tags
closestart = root._text:find("<", closestart)

-- Create text node for any text before this closing tag {{{
if closing == "/" then
local tagstart = root._rawtext:find("<", closestart)
if lasttagend and tagstart and lasttagend < tagstart then
index = index + 1
-- Text before closing tag should be a child of the tag being closed
local textnode = ElementNode:new(index, "_text", tag, true, lasttagend, tagstart - 1)
textnode:close()
dbg("[TagCloseLoop]:#LINE# Created text node from %d to %d", lasttagend, tagstart - 1)
end
lasttagend = closeend + 1
end
-- }}}
closestart = root._rawtext:find("<", closestart)
dbg("[TagCloseLoop]:#LINE# closestart=%s",str(closestart))
tag:close(closestart, closeend + 1)
node = tag.parent
Expand All @@ -247,7 +271,7 @@ local function parse(text,limit) -- {{{
if tpl then -- {{{
dbg("tpl")
for k,v in pairs(tpr) do
root._text = root._text:gsub(v,k)
root._rawtext = root._rawtext:gsub(v,k)
end
end -- }}}
return root
Expand Down
14 changes: 7 additions & 7 deletions src/htmlparser/ElementNode.lua
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend)
if not node then
instance.name = "root"
instance.root = instance
instance._text = nameortext
instance._rawtext = nameortext
local length = string.len(nameortext)
instance._openstart, instance._openend = 1, length
instance._closestart, instance._closeend = 1, length
Expand All @@ -124,20 +124,20 @@ function ElementNode:new(index, nameortext, node, descend, openstart, openend)
return setmetatable(instance, ElementNode.mt)
end

function ElementNode:gettext()
return string.sub(self.root._text, self._openstart, self._closeend)
function ElementNode:getrawtext()
return string.sub(self.root._rawtext, self._openstart, self._closeend)
end

function ElementNode:settext(c)
self.root._text=c
function ElementNode:setrawtext(c)
self.root._rawtext=c
end

function ElementNode:textonly()
return (self:gettext():gsub("<[^>]*>",""))
return (self:getrawtext():gsub("<[^>]*>",""))
end

function ElementNode:getcontent()
return string.sub(self.root._text, self._openend + 1, self._closestart - 1)
return string.sub(self.root._rawtext, self._openend + 1, self._closestart - 1)
end

function ElementNode:addattribute(k, v)
Expand Down
90 changes: 70 additions & 20 deletions tst/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ function test_void()
<br >
<br />
]])
assert_equal(5, #tree.nodes, "top level")
assert_equal(9, #tree.nodes, "top level")
for _,n in ipairs(tree.nodes) do
if n.name == "p" then
assert_equal(4, #n.nodes, "deeper level")
else
assert_equal(9, #n.nodes, "deeper level")
elseif n.name ~= "_text" then
assert_equal("br", n.name, "name")
assert_equal("", n:getcontent(), "content")
end
Expand Down Expand Up @@ -70,7 +70,7 @@ function test_class()
<n class="two three"></n>
<n ssalc="four"></n>
]])
assert_equal(3, #tree.nodes, "top level")
assert_equal(5, #tree.nodes, "top level")
assert_equal(1, #tree(".one"), ".one")
assert_equal(2, #tree(".two"), ".two")
assert_equal(2, #tree(".three"), ".three")
Expand Down Expand Up @@ -126,10 +126,10 @@ function test_attr_notequal()
<n a1></n>
<n></n>
]])
assert_equal(4, #tree.nodes, "top level")
assert_equal(3, #tree("[a1!='a1']"), "a1!='a1'")
assert_equal(4, #tree("[a1!='b1']"), "a1!='b1'")
assert_equal(3, #tree("[a1!='']"), "a1!=''")
assert_equal(7, #tree.nodes, "top level")
assert_equal(6, #tree("[a1!='a1']"), "a1!='a1'")
assert_equal(7, #tree("[a1!='b1']"), "a1!='b1'")
assert_equal(6, #tree("[a1!='']"), "a1!=''")
assert_equal(3, #tree("[a1!=]"), "a1!=")
end

Expand All @@ -141,7 +141,7 @@ function test_attr_prefix_start_end()
<n a1="enen"></n>
<n></n>
]])
assert_equal(5, #tree.nodes, "top level")
assert_equal(9, #tree.nodes, "top level")
assert_equal(3, #tree("[a1|='en']"), "a1|='en'")
assert_equal(4, #tree("[a1^='en']"), "a1^='en'")
assert_equal(2, #tree("[a1$='en']"), "a1$='en'")
Expand All @@ -154,7 +154,7 @@ function test_attr_word()
<n a1></n>
<n></n>
]])
assert_equal(4, #tree.nodes, "top level")
assert_equal(7, #tree.nodes, "top level")
assert_equal(1, #tree("[a1~='two']"), "a1~='two'")
assert_equal(2, #tree("[a1~='three']"), "a1~='three'")
assert_equal(1, #tree("[a1~='four']"), "a1~='four'")
Expand All @@ -169,7 +169,7 @@ function test_attr_contains()
<n a1></n>
<n></n>
]])
assert_equal(6, #tree.nodes, "top level")
assert_equal(11, #tree.nodes, "top level")
assert_equal(2, #tree("[a1*='one']"), "a1*='one'")
assert_equal(2, #tree("[a1*='t']"), "a1*='t'")
assert_equal(1, #tree("[a1*='f']"), "a1*='f'")
Expand Down Expand Up @@ -238,11 +238,11 @@ function test_not()
</n>
<n a2></n>
]])
assert_equal(2, #tree.nodes, "top level")
assert_equal(1, #tree(":not([a1=1])"), ":not([a1=1])")
assert_equal(1, #tree(":not([a2])"), ":not([a2])")
assert_equal(1, #tree(":not(n)"), ":not(n)")
assert_equal(2, #tree(":not(m)"), ":not(m)")
assert_equal(3, #tree.nodes, "top level")
assert_equal(4, #tree(":not([a1=1])"), ":not([a1=1])")
assert_equal(4, #tree(":not([a2])"), ":not([a2])")
assert_equal(4, #tree(":not(n)"), ":not(n)")
assert_equal(5, #tree(":not(m)"), ":not(m)")
end

function test_combine()
Expand All @@ -256,7 +256,7 @@ function test_combine()
</e>
<n b="222"></n>
]])
assert_equal(2, #tree.nodes, "top level")
assert_equal(3, #tree.nodes, "top level")
assert_equal(2, #tree("e.c:not([a|='1']) > n[b*='2']"), "e.c:not([a|='1']) > n[b*='2']")
assert_equal(3, #tree("e.c:not([a|='1']) n[b*='2']"), "e.c:not([a|='1']) n[b*='2']")
assert_equal(1, #tree("#123 .c[b]"), "#123 .c[b]")
Expand Down Expand Up @@ -290,9 +290,14 @@ function test_order()
assert_equal(i, tonumber(v:getcontent()), "n order")
end
local notn = tree(":not(n)")
assert_equal(4, #notn, "notn")
assert_equal(31, #notn, "notn")
local blanks = 0
for i,v in pairs(notn) do
assert_equal(i, tonumber(v.name), "notn order")
if v.name ~= "_text" then
assert_equal(i, blanks+tonumber(v.name), "notn order")
else
blanks = blanks + 1
end
end
end

Expand Down Expand Up @@ -327,5 +332,50 @@ function test_loop_limit()
<a id="unclosed>Element"> with unclosed attribute</a>
<div data-pic="aa<%=image_url%>bb" ></div>
]]) -- issue#42
assert(#tree.nodes==17)
assert(#tree.nodes==33)
end

--- Checks that text between tags shows up as "_text" children of the
-- enclosing element, in document order, alongside the regular elements.
-- Also checks that getcontent() on the parent still yields the raw inner markup.
function test_text_nodes()
	local root = htmlparser.parse("<p>line1<br />line2</p>")
	assert_equal(1, #root.nodes, "top level")

	local para = root.nodes[1]
	assert_equal("p", para.name, "p element")

	-- Expected children: text "line1", element <br />, text "line2".
	local kids = para.nodes
	assert_equal(3, #kids, "p should have 3 children")

	local leading, middle, trailing = kids[1], kids[2], kids[3]

	assert_equal("_text", leading.name, "first child should be text node")
	assert_equal("line1", leading:getrawtext(), "first text content")

	assert_equal("br", middle.name, "second child should be br")

	assert_equal("_text", trailing.name, "third child should be text node")
	assert_equal("line2", trailing:getrawtext(), "third text content")

	-- The parent's content accessor must be unaffected by the new text nodes.
	local inner = para:getcontent()
	assert_equal("line1<br />line2", inner, "getcontent backward compatibility")
end

--- Checks that whitespace-only runs between tags are kept as "_text"
-- nodes rather than being dropped.
function test_text_nodes_whitespace()
	local root = htmlparser.parse("<p> <br/> </p>")
	assert_equal(1, #root.nodes, "top level")

	-- Expected children of <p>: " ", <br/>, " ".
	local kids = root.nodes[1].nodes
	assert_equal(3, #kids, "p should have 3 children including whitespace")

	local before, after = kids[1], kids[3]
	assert_equal("_text", before.name, "first should be whitespace text node")
	assert_equal("_text", after.name, "third should be whitespace text node")
end

--- Checks how selectors interact with "_text" nodes: the universal selector
-- counts them, element selectors ignore them, and "_text" selects them by name.
function test_text_nodes_selectors()
	local root = htmlparser.parse("<div>text<span>inner</span>more</div>")
	local container = root.nodes[1]

	-- Everything beneath <div>: "text", <span>, "inner", "more".
	local everything = container:select("*")
	assert_equal(4, #everything, "* selector should return 4 nodes (3 text + 1 span)")

	local span_hits = container:select("span")
	assert_equal(1, #span_hits, "span selector should return 1 node")
	local only_span = span_hits[1]
	assert_equal("span", only_span.name, "should be span element")

	local text_hits = container:select("_text")
	assert_equal(3, #text_hits, "_text selector should return 3 text nodes")
	for _, hit in ipairs(text_hits) do
		assert_equal("_text", hit.name, "should be text node")
	end
end