diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..ef73ef7 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,25 @@ +name: Test +on: [push, pull_request] +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: 'true' + + - uses: hishamhm/gh-actions-lua@master + with: + luaVersion: "5.4" + + - uses: luarocks/gh-actions-luarocks@master + with: + luaRocksVersion: "3.12.2" + + - name: Build + run: "luarocks build" + + - name: Test + run: "luarocks test" + diff --git a/.gitmodules b/.gitmodules index a83acc9..b6c3bf0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,7 @@ path = spec/kdl-org url = git@github.com:kdl-org/kdl branch = kdl-v2 +[submodule "spec/v1/kdl-org"] + path = spec/v1/kdl-org + url = https://github.com/kdl-org/kdl.git + branch = release/v1 diff --git a/kdlua-dev-1.rockspec b/kdlua-1.0.0-0.rockspec similarity index 79% rename from kdlua-dev-1.rockspec rename to kdlua-1.0.0-0.rockspec index c3930fd..d3c54c4 100644 --- a/kdlua-dev-1.rockspec +++ b/kdlua-1.0.0-0.rockspec @@ -1,5 +1,5 @@ package = "kdlua" -version = "dev-1" +version = "1.0.0-0" rockspec_format = "3.0" source = { url = "git://github.com/danini-the-pamini/kdlua" @@ -20,7 +20,10 @@ build = { ["kdl.parser"] = "src/parser.lua", ["kdl.tokenizer"] = "src/tokenizer.lua", ["kdl.stringdumper"] = "src/stringdumper.lua", - ["kdl.util"] = "src/util.lua" + ["kdl.util"] = "src/util.lua", + ["kdl.v1.parser"] = "src/v1/parser.lua", + ["kdl.v1.tokenizer"] = "src/v1/tokenizer.lua", + ["kdl.v1.util"] = "src/v1/util.lua" } } test = { @@ -31,4 +34,4 @@ dependencies = { } test_dependencies = { "luafilesystem >= 1.8.0 < 2.0.0" -} \ No newline at end of file +} diff --git a/spec/kdl-org b/spec/kdl-org index d0b30c3..ebf9ef7 160000 --- a/spec/kdl-org +++ b/spec/kdl-org @@ -1 +1 @@ -Subproject commit d0b30c3f35fe406912d60c43dfb92b872f3c9e60 +Subproject commit ebf9ef764908e132c720833dd8813fc5c3d14f42 
diff --git a/spec/kdl_spec.lua b/spec/kdl_spec.lua index 359da37..85d4a00 100644 --- a/spec/kdl_spec.lua +++ b/spec/kdl_spec.lua @@ -1,6 +1,25 @@ +local lfs = require "lfs" +local kdl = require "kdl" +require "spec.support" + describe("kdl", function() - local lfs = require "lfs" - local kdl = require "kdl" + it("detects version", function() + -- parses either v1 or v2 + assert.valid_kdl("node foo #true", "node foo #true") + assert.valid_kdl("node \"foo\" true", "node foo #true") + + -- chooses parser based on version directive + assert.valid_kdl("/- kdl-version 1\nnode \"foo\" true", "node foo #true") + assert.valid_kdl("/- kdl-version 2\nnode foo #true", "node foo #true") + + -- fails parsing if syntax does not match version directive + assert.is_not.valid_kdl("/- kdl-version 1\nnode foo #true", "Expected EQUALS, got WS (2:9)") + assert.is_not.valid_kdl("/- kdl-version 2\nnode \"foo\" true", "Identifier cannot be a literal (2:12)") + + -- fails parsing mixed syntax + assert.is_not.valid_kdl("node foo true", "Expected EQUALS, got WS (1:9)") + assert.is_not.valid_kdl("node r\"foo\" #true", "Expected EQUALS, got EOF (1:18)") + end) local function exists(name) local f=io.open(name,"r") @@ -14,11 +33,6 @@ describe("kdl", function() return s end - local function parse(str) - local ok, r = xpcall(kdl.parse_document, debug.traceback, str) - if ok then return r else error(r) end - end - local TEST_CASES = "spec/kdl-org/tests/test_cases" for file in lfs.dir(TEST_CASES.."/input") do if file ~= "." and file ~= ".." 
then @@ -26,11 +40,11 @@ describe("kdl", function() local expected = TEST_CASES.."/expected_kdl/"..file if exists(expected) then it("parses "..input, function() - assert.equals(readfile(expected), tostring(parse(readfile(input)))) + assert.valid_kdl(readfile(input), readfile(expected), 2) end) else it("does not parse "..input, function() - assert.has_error(function() kdl.parse_document(readfile(input)) end) + assert.is_not.valid_kdl(readfile(input), 2) end) end end diff --git a/spec/parser_spec.lua b/spec/parser_spec.lua index 168138e..e7d207b 100644 --- a/spec/parser_spec.lua +++ b/spec/parser_spec.lua @@ -1,14 +1,11 @@ +require "spec.support" + describe("parser", function() local parser = require "kdl.parser" local document = require "kdl.document" local node = require "kdl.node" local value = require "kdl.value" - local function parse(str) - local ok, r = xpcall(parser.parse, debug.traceback, str) - if ok then return r else error(r) end - end - local function n(name, args, children, ty, fn) if type(args) == "function" then fn = args @@ -31,28 +28,30 @@ describe("parser", function() end it("parses empty string", function() - assert.same(document.new(), parse("")) - assert.same(document.new(), parse(" ")) - assert.same(document.new(), parse("\n")) + assert.valid_kdl("", document.new(), 2) + assert.valid_kdl(" ", document.new(), 2) + assert.valid_kdl("\n", document.new(), 2) end) it("parses nodes", function() - assert.same(document.new{ node.new("node") }, parse("node")) - assert.same(document.new{ node.new("node") }, parse("node\n")) - assert.same(document.new{ node.new("node") }, parse("\nnode\n")) - assert.same( + assert.valid_kdl("node", document.new{ node.new("node") }, 2) + assert.valid_kdl("node\n", document.new{ node.new("node") }, 2) + assert.valid_kdl("\nnode\n", document.new{ node.new("node") }, 2) + assert.valid_kdl( + "node1\nnode2", document.new{ node.new("node1"), node.new("node2") }, - parse("node1\nnode2") + 2 ) end) it("parses node entries", 
function() - assert.same(document.new{ node.new("node") }, parse("node;")) - assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node 1")) - assert.same( + assert.valid_kdl("node;", document.new{ node.new("node") }, 2) + assert.valid_kdl("node 1", document.new{ node.new("node", { value.new(1) }) }, 2) + assert.valid_kdl( + 'node 1 2 "3" #true #false #null', document.new{ node.new("node", { value.new(1), @@ -63,422 +62,491 @@ describe("parser", function() value.new(nil) }), }, - parse('node 1 2 "3" #true #false #null') + 2 ) - assert.same(document.new{ node.new("node", {}, { node.new("node2") }) }, parse("node {\n node2\n}")) - assert.same(document.new{ node.new("node", {}, { node.new("node2") }) }, parse("node {\n node2 \n}")) - assert.same(document.new{ node.new("node", {}, { node.new("node2") }) }, parse("node { node2; }")) - assert.same(document.new{ node.new("node", {}, { node.new("node2") }) }, parse("node { node2 }")) - assert.same(document.new{ node.new("node", {}, { node.new("node2"), node.new("node3") }) }, parse("node { node2; node3 }")) + assert.valid_kdl("node {\n node2\n}", document.new{ node.new("node", {}, { node.new("node2") }) }, 2) + assert.valid_kdl("node {\n node2 \n}", document.new{ node.new("node", {}, { node.new("node2") }) }, 2) + assert.valid_kdl("node { node2; }", document.new{ node.new("node", {}, { node.new("node2") }) }, 2) + assert.valid_kdl("node { node2 }", document.new{ node.new("node", {}, { node.new("node2") }) }, 2) + assert.valid_kdl("node { node2; node3 }", document.new{ node.new("node", {}, { node.new("node2"), node.new("node3") }) }, 2) end) it("parses slashdash nodes", function() - assert.same(document.new(), parse("/-node")) - assert.same(document.new(), parse("/- node")) - assert.same(document.new(), parse("/- node\n")) - assert.same(document.new(), parse("/-node 1 2 3")) - assert.same(document.new(), parse("/-node key=#false")) - assert.same(document.new(), parse("/-node{\nnode\n}")) - 
assert.same(document.new(), parse("/-node 1 2 3 key=\"value\" \\\n{\nnode\n}")) + assert.valid_kdl("/-node", document.new(), 2) + assert.valid_kdl("/- node", document.new(), 2) + assert.valid_kdl("/- node\n", document.new(), 2) + assert.valid_kdl("/-node 1 2 3", document.new(), 2) + assert.valid_kdl("/-node key=#false", document.new(), 2) + assert.valid_kdl("/-node{\nnode\n}", document.new(), 2) + assert.valid_kdl("/-node 1 2 3 key=\"value\" \\\n{\nnode\n}", document.new(), 2) end); it("parses slashdash args", function() - assert.same(document.new{ node.new("node") }, parse("node /-1")) - assert.same(document.new{ node.new("node", { value.new(2) }) }, parse("node /-1 2")) - assert.same(document.new{ node.new("node", { value.new(1), value.new(3) }) }, parse("node 1 /- 2 3")) - assert.same(document.new{ node.new("node") }, parse("node /--1")) - assert.same(document.new{ node.new("node") }, parse("node /- -1")) - assert.same(document.new{ node.new("node") }, parse("node \\\n/- -1")) + assert.valid_kdl("node /-1", document.new{ node.new("node") }, 2) + assert.valid_kdl("node /-1 2", document.new{ node.new("node", { value.new(2) }) }, 2) + assert.valid_kdl("node 1 /- 2 3", document.new{ node.new("node", { value.new(1), value.new(3) }) }, 2) + assert.valid_kdl("node /--1", document.new{ node.new("node") }, 2) + assert.valid_kdl("node /- -1", document.new{ node.new("node") }, 2) + assert.valid_kdl("node \\\n/- -1", document.new{ node.new("node") }, 2) end) it("parses slashdash props", function() - assert.same(document.new{ node.new("node") }, parse("node /-key=1")) - assert.same(document.new{ node.new("node") }, parse("node /- key=1")) - assert.same(document.new{ node.new("node", { ["key"]=value.new(1) }) }, parse("node key=1 /-key2=2")) + assert.valid_kdl("node /-key=1", document.new{ node.new("node") }, 2) + assert.valid_kdl("node /- key=1", document.new{ node.new("node") }, 2) + assert.valid_kdl("node key=1 /-key2=2", document.new{ node.new("node", { 
["key"]=value.new(1) }) }, 2) end) it("parses slashdash children", function() - assert.same(document.new{ node.new("node") }, parse("node /-{}")) - assert.same(document.new{ node.new("node") }, parse("node /- {}")) - assert.same(document.new{ node.new("node") }, parse("node /-{\nnode2\n}")) + assert.valid_kdl("node /-{}", document.new{ node.new("node") }, 2) + assert.valid_kdl("node /- {}", document.new{ node.new("node") }, 2) + assert.valid_kdl("node /-{\nnode2\n}", document.new{ node.new("node") }, 2) end) it('parses strings', function() - assert.same(document.new{ node.new('node', { value.new("") }) }, parse('node ""')) - assert.same(document.new{ node.new('node', { value.new("hello") }) }, parse('node "hello"')) - assert.same(document.new{ node.new('node', { value.new("hello\nworld") }) }, parse([[node "hello\nworld"]])) - assert.same(document.new{ node.new('node', { value.new("-flag") }) }, parse([[node -flag]])) - assert.same(document.new{ node.new('node', { value.new("--flagg") }) }, parse([[node --flagg]])) - assert.same(document.new{ node.new('node', { value.new("\u{10FFF}") }) }, parse([[node "\u{10FFF}"]])) - assert.same(document.new{ node.new('node', { value.new("\"\\\u{08}\u{0C}\n\r\t") }) }, parse([[node "\"\\\b\f\n\r\t"]])) - assert.same(document.new{ node.new('node', { value.new("\u{10}") }) }, parse([[node "\u{10}"]])) - assert.has_error(function() parser.parse([[node "\i"]]) end, "Unexpected escape: \\i") - assert.has_error(function() parser.parse([[node "\u{c0ffee}"]]) end, "Invalid code point \\u{c0ffee}") + assert.valid_kdl('node ""', document.new{ node.new('node', { value.new("") }) }, 2) + assert.valid_kdl('node "hello"', document.new{ node.new('node', { value.new("hello") }) }, 2) + assert.valid_kdl([[node "hello\nworld"]], document.new{ node.new('node', { value.new("hello\nworld") }) }, 2) + assert.valid_kdl([[node -flag]], document.new{ node.new('node', { value.new("-flag") }) }, 2) + assert.valid_kdl([[node --flagg]], document.new{ 
node.new('node', { value.new("--flagg") }) }, 2) + assert.valid_kdl([[node "\u{10FFF}"]], document.new{ node.new('node', { value.new("\u{10FFF}") }) }, 2) + assert.valid_kdl([[node "\"\\\b\f\n\r\t"]], document.new{ node.new('node', { value.new("\"\\\u{08}\u{0C}\n\r\t") }) }, 2) + assert.valid_kdl([[node "\u{10}"]], document.new{ node.new('node', { value.new("\u{10}") }) }, 2) + assert.is_not.valid_kdl([[node "\i"]], "Unexpected escape: \\i (1:6)", 2) + assert.is_not.valid_kdl([[node "\u{c0ffee}"]], "Invalid code point \\u{c0ffee} (1:6)", 2) + assert.is_not.valid_kdl([[node "oops]], "Unterminated string literal (1:6)", 2) end) it("parses unindented multiline strings", function() - assert.same(document.new{ node.new("node", { value.new("foo\nbar\n baz\nqux") }) }, parse("node \"\n foo\n bar\n baz\n qux\n \"")) - assert.same(document.new{ node.new("node", { value.new("foo\nbar\n baz\nqux") }) }, parse("node #\"\n foo\n bar\n baz\n qux\n \"#")) - assert.has_error(function() parser.parse("node \"\n foo\n bar\n baz\n \"") end, "Invalid multiline string indentation") - assert.has_error(function() parser.parse("node \"\n foo\n bar\n baz\n qux\"") end, "Invalid muliline string final line: ' qux'") - assert.has_error(function() parser.parse("node #\"\n foo\n bar\n baz\n \"#") end, "Invalid multiline string indentation") + assert.valid_kdl('node """\n foo\n bar\n baz\n qux\n """', document.new{ node.new("node", { value.new("foo\nbar\n baz\nqux") }) }, 2) + assert.valid_kdl('node #"""\n foo\n bar\n baz\n qux\n """#', document.new{ node.new("node", { value.new("foo\nbar\n baz\nqux") }) }, 2) + assert.is_not.valid_kdl('node """\n foo\n bar\n baz\n """', "Invalid multi-line string indentation (1:6)", 2) + assert.is_not.valid_kdl('node #"""\n foo\n bar\n baz\n """#', "Invalid multi-line string indentation (1:6)", 2) end) it("parses floats", function() - assert.same(document.new{ node.new("node", { value.new(1.0) }) }, parse("node 1.0")) - assert.same(document.new{ node.new("node", 
{ value.new(0.0) }) }, parse("node 0.0")) - assert.same(document.new{ node.new("node", { value.new(-1.0) }) }, parse("node -1.0")) - assert.same(document.new{ node.new("node", { value.new(1.0) }) }, parse("node +1.0")) - assert.same(document.new{ node.new("node", { value.new(1.0e10) }) }, parse("node 1.0e10")) - assert.same(document.new{ node.new("node", { value.new(1.0e-10) }) }, parse("node 1.0e-10")) - assert.same(document.new{ node.new("node", { value.new(123456789.0) }) }, parse("node 123_456_789.0")) - assert.same(document.new{ node.new("node", { value.new(123456789.0) }) }, parse("node 123_456_789.0_")) - assert.has_error(function() parser.parse("node 1._0") end, "Invalid number: 1._0") - assert.has_error(function() parser.parse("node 1.") end, "Invalid number: 1.") - assert.has_error(function() parser.parse("node 1.0v2") end, "Unexpected 'v'") - assert.has_error(function() parser.parse("node -1em") end, "Unexpected 'm'") - assert.has_error(function() parser.parse("node .0") end, "Identifier cannot look like an illegal float") + assert.valid_kdl("node 1.0", document.new{ node.new("node", { value.new(1.0) }) }, 2) + assert.valid_kdl("node 0.0", document.new{ node.new("node", { value.new(0.0) }) }, 2) + assert.valid_kdl("node -1.0", document.new{ node.new("node", { value.new(-1.0) }) }, 2) + assert.valid_kdl("node +1.0", document.new{ node.new("node", { value.new(1.0) }) }, 2) + assert.valid_kdl("node 1.0e10", document.new{ node.new("node", { value.new(1.0e10) }) }, 2) + assert.valid_kdl("node 1.0e-10", document.new{ node.new("node", { value.new(1.0e-10) }) }, 2) + assert.valid_kdl("node 123_456_789.0", document.new{ node.new("node", { value.new(123456789.0) }) }, 2) + assert.valid_kdl("node 123_456_789.0_", document.new{ node.new("node", { value.new(123456789.0) }) }, 2) + assert.is_not.valid_kdl("node 1._0", "Invalid number: 1._0 (1:6)", 2) + assert.is_not.valid_kdl("node 1.", "Invalid number: 1. 
(1:6)", 2) + assert.is_not.valid_kdl("node 1.0v2", "Unexpected 'v' (1:6)", 2) + assert.is_not.valid_kdl("node -1em", "Unexpected 'm' (1:6)", 2) + assert.is_not.valid_kdl("node .0", "Identifier cannot look like an illegal float (1:6)", 2) end) it("parses integers", function() - assert.same(document.new{ node.new("node", { value.new(0) }) }, parse("node 0")) - assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node 0123456789")) - assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node 0123_456_789")) - assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node 0123_456_789_")) - assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node +0123456789")) - assert.same(document.new{ node.new("node", { value.new(-123456789) }) }, parse("node -0123456789")) + assert.valid_kdl("node 0", document.new{ node.new("node", { value.new(0) }) }, 2) + assert.valid_kdl("node 0123456789", document.new{ node.new("node", { value.new(123456789) }) }, 2) + assert.valid_kdl("node 0123_456_789", document.new{ node.new("node", { value.new(123456789) }) }, 2) + assert.valid_kdl("node 0123_456_789_", document.new{ node.new("node", { value.new(123456789) }) }, 2) + assert.valid_kdl("node +0123456789", document.new{ node.new("node", { value.new(123456789) }) }, 2) + assert.valid_kdl("node -0123456789", document.new{ node.new("node", { value.new(-123456789) }) }, 2) end) + it("parses hexadecimal", function() + assert.valid_kdl("node 0x0123456789abcdef", document.new{ node.new("node", { value.new(0x0123456789abcdef) }) }, 2) + assert.valid_kdl("node 0x01234567_89abcdef", document.new{ node.new("node", { value.new(0x0123456789abcdef) }) }, 2) + assert.valid_kdl("node 0x01234567_89abcdef_", document.new{ node.new("node", { value.new(0x0123456789abcdef) }) }, 2) + assert.is_not.valid_kdl("node 0x_123", "Invalid hexadecimal: _123 (1:6)", 2) + assert.is_not.valid_kdl("node 0xG", "Unexpected 'G' 
(1:6)", 2) + assert.is_not.valid_kdl("node 0xx", "Unexpected 'x' (1:6)", 2) + end) + + it("parses octal", function() - assert.same(document.new{ node.new("node", { value.new(342391) }) }, parse("node 0o01234567")) - assert.same(document.new{ node.new("node", { value.new(342391) }) }, parse("node 0o0123_4567")) - assert.same(document.new{ node.new("node", { value.new(342391) }) }, parse("node 0o01234567_")) - assert.has_error(function() parser.parse("node 0o_123") end, "Invalid octal: _123") - assert.has_error(function() parser.parse("node 0o8") end, "Unexpected '8'") - assert.has_error(function() parser.parse("node 0oo") end, "Unexpected 'o'") + assert.valid_kdl("node 0o01234567", document.new{ node.new("node", { value.new(342391) }) }, 2) + assert.valid_kdl("node 0o0123_4567", document.new{ node.new("node", { value.new(342391) }) }, 2) + assert.valid_kdl("node 0o01234567_", document.new{ node.new("node", { value.new(342391) }) }, 2) + assert.is_not.valid_kdl("node 0o_123", "Invalid octal: _123 (1:6)", 2) + assert.is_not.valid_kdl("node 0o8", "Unexpected '8' (1:6)", 2) + assert.is_not.valid_kdl("node 0oo", "Unexpected 'o' (1:6)", 2) end) it("parses binary", function() - assert.same(document.new{ node.new("node", { value.new(5) }) }, parse("node 0b0101")) - assert.same(document.new{ node.new("node", { value.new(6) }) }, parse("node 0b01_10")) - assert.same(document.new{ node.new("node", { value.new(6) }) }, parse("node 0b01___10")) - assert.same(document.new{ node.new("node", { value.new(6) }) }, parse("node 0b0110_")) - assert.has_error(function() parser.parse("node 0b_0110") end, "Invalid binary: _0110") - assert.has_error(function() parser.parse("node 0b20") end, "Unexpected '2'") - assert.has_error(function() parser.parse("node 0bb") end, "Unexpected 'b'") + assert.valid_kdl("node 0b0101", document.new{ node.new("node", { value.new(5) }) }, 2) + assert.valid_kdl("node 0b01_10", document.new{ node.new("node", { value.new(6) }) }, 2) + assert.valid_kdl("node 
0b01___10", document.new{ node.new("node", { value.new(6) }) }, 2) + assert.valid_kdl("node 0b0110_", document.new{ node.new("node", { value.new(6) }) }, 2) + assert.is_not.valid_kdl("node 0b_0110", "Invalid binary: _0110 (1:6)", 2) + assert.is_not.valid_kdl("node 0b20", "Unexpected '2' (1:6)", 2) + assert.is_not.valid_kdl("node 0bb", "Unexpected 'b' (1:6)", 2) end) it("parses raw strings", function() - assert.same(document.new{ node.new("node", { value.new("foo") }) }, parse([[node #"foo"#]])) - assert.same(document.new{ node.new("node", { value.new([[foo\nbar]]) }) }, parse([[node #"foo\nbar"#]])) - assert.same(document.new{ node.new("node", { value.new("foo") }) }, parse([[node #"foo"#]])) - assert.same(document.new{ node.new("node", { value.new("foo") }) }, parse([[node ##"foo"##]])) - assert.same(document.new{ node.new("node", { value.new([[\nfoo\r]]) }) }, parse([[node #"\nfoo\r"#]])) - assert.has_error(function() parser.parse('node ##"foo"#') end, "Unterminated rawstring literal") + assert.valid_kdl([[node #"foo"#]], document.new{ node.new("node", { value.new("foo") }) }, 2) + assert.valid_kdl([[node #"foo\nbar"#]], document.new{ node.new("node", { value.new([[foo\nbar]]) }) }, 2) + assert.valid_kdl([[node #"foo"#]], document.new{ node.new("node", { value.new("foo") }) }, 2) + assert.valid_kdl([[node ##"foo"##]], document.new{ node.new("node", { value.new("foo") }) }, 2) + assert.valid_kdl([[node #"\nfoo\r"#]], document.new{ node.new("node", { value.new([[\nfoo\r]]) }) }, 2) + assert.is_not.valid_kdl('node ##"foo"#', "Unterminated rawstring literal (1:6)", 2) end) it("parses booleans", function() - assert.same(document.new{ node.new("node", { value.new(true) }) }, parse("node #true")) - assert.same(document.new{ node.new("node", { value.new(false) }) }, parse("node #false")) + assert.valid_kdl("node #true", document.new{ node.new("node", { value.new(true) }) }, 2) + assert.valid_kdl("node #false", document.new{ node.new("node", { value.new(false) }) }, 2) 
end) it("parses nulls", function() - assert.same(document.new{ node.new("node", { value.new(nil) }) }, parse("node #null")) + assert.valid_kdl("node #null", document.new{ node.new("node", { value.new(nil) }) }, 2) end) it("parses node spacing", function() - assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node 1")) - assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node\t1")) - assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node\t \\ // hello\n 1")) + assert.valid_kdl("node 1", document.new{ node.new("node", { value.new(1) }) }, 2) + assert.valid_kdl("node\t1", document.new{ node.new("node", { value.new(1) }) }, 2) + assert.valid_kdl("node\t \\ // hello\n 1", document.new{ node.new("node", { value.new(1) }) }, 2) end) it("parses single line comment", function() - assert.same(document.new{}, parse("//hello")) - assert.same(document.new{}, parse("// \thello")) - assert.same(document.new{}, parse("//hello\n")) - assert.same(document.new{}, parse("//hello\r\n")) - assert.same(document.new{}, parse("//hello\n\r")) - assert.same(document.new{ node.new("world") }, parse("//hello\rworld")) - assert.same(document.new{ node.new("world") }, parse("//hello\nworld\r\n")) + assert.valid_kdl("//hello", document.new{}, 2) + assert.valid_kdl("// \thello", document.new{}, 2) + assert.valid_kdl("//hello\n", document.new{}, 2) + assert.valid_kdl("//hello\r\n", document.new{}, 2) + assert.valid_kdl("//hello\n\r", document.new{}, 2) + assert.valid_kdl("//hello\rworld", document.new{ node.new("world") }, 2) + assert.valid_kdl("//hello\nworld\r\n", document.new{ node.new("world") }, 2) end) it("parses multi line comment", function() - assert.same(document.new{}, parse("/*hello*/")); - assert.same(document.new{}, parse("/*hello*/\n")); - assert.same(document.new{}, parse("/*\nhello\r\n*/")); - assert.same(document.new{}, parse("/*\nhello** /\n*/")); - assert.same(document.new{}, parse("/**\nhello** /\n*/")); - 
assert.same(document.new{ node.new("world") }, parse("/*hello*/world")); + assert.valid_kdl("/*hello*/", document.new{}, 2); + assert.valid_kdl("/*hello*/\n", document.new{}, 2); + assert.valid_kdl("/*\nhello\r\n*/", document.new{}, 2); + assert.valid_kdl("/*\nhello** /\n*/", document.new{}, 2); + assert.valid_kdl("/**\nhello** /\n*/", document.new{}, 2); + assert.valid_kdl("/*hello*/world", document.new{ node.new("world") }, 2); end) it("parses esclines", function() - assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node\\\n 1")) - assert.same(document.new{ node.new("node") }, parse("node\\\n")) - assert.same(document.new{ node.new("node") }, parse("node\\ \n")) - assert.same(document.new{ node.new("node") }, parse("node\\\n ")) + assert.valid_kdl("node\\\n 1", document.new{ node.new("node", { value.new(1) }) }, 2) + assert.valid_kdl("node\\\n", document.new{ node.new("node") }, 2) + assert.valid_kdl("node\\ \n", document.new{ node.new("node") }, 2) + assert.valid_kdl("node\\\n ", document.new{ node.new("node") }, 2) + assert.is_not.valid_kdl('node \\foo', [[Unexpected '\' (1:5)]], 2) + assert.is_not.valid_kdl('node\\\\\nnode2', [[Unexpected '\' (1:5)]], 2) + assert.is_not.valid_kdl('node \\\\\nnode2', [[Unexpected '\' (1:5)]], 2) end) it("parses whitespace", function() - assert.same(document.new{ node.new("node") }, parse(" node")) - assert.same(document.new{ node.new("node") }, parse("\tnode")) - assert.same(document.new{ node.new("etc") }, parse("/* \nfoo\r\n */ etc")) + assert.valid_kdl(" node", document.new{ node.new("node") }, 2) + assert.valid_kdl("\tnode", document.new{ node.new("node") }, 2) + assert.valid_kdl("/* \nfoo\r\n */ etc", document.new{ node.new("etc") }, 2) end) it('parses newlines', function() - assert.same(document.new{ node.new('node1'), node.new('node2') }, parse("node1\nnode2")) - assert.same(document.new{ node.new('node1'), node.new('node2') }, parse("node1\rnode2")) - assert.same(document.new{ node.new('node1'), node.new('node2') }, 
parse("node1\r\nnode2")) - assert.same(document.new{ node.new('node1'), node.new('node2') }, parse("node1\n\nnode2")) + assert.valid_kdl("node1\nnode2", document.new{ node.new('node1'), node.new('node2') }, 2) + assert.valid_kdl("node1\rnode2", document.new{ node.new('node1'), node.new('node2') }, 2) + assert.valid_kdl("node1\r\nnode2", document.new{ node.new('node1'), node.new('node2') }, 2) + assert.valid_kdl("node1\n\nnode2", document.new{ node.new('node1'), node.new('node2') }, 2) end) - it("pasrses basic", function() - local doc = parse('title "Hello, World"') - local nodes = document.new{ - node.new("title", { value.new("Hello, World") }) - } - assert.same(nodes, doc) + it("parses basic", function() + assert.valid_kdl( + 'title "Hello, World"', + document.new{ + node.new("title", { value.new("Hello, World") }) + }, + 2 + ) end) it("parses multiple values", function() - local doc = parse("bookmarks 12 15 188 1234") - local nodes = document.new{ - node.new("bookmarks", { value.new(12), value.new(15), value.new(188), value.new(1234) }) - } - assert.same(nodes, doc) + assert.valid_kdl( + "bookmarks 12 15 188 1234", + document.new{ + node.new("bookmarks", { value.new(12), value.new(15), value.new(188), value.new(1234) }) + }, + 2 + ) end) it("parses properties", function() - local doc = parse[[ - author "Alex Monad" email="alex@example.com" active= #true - foo bar =#true "baz" quux =\ - #false 1 2 3 - ]] - local nodes = document.new{ - n("author", { value.new("Alex Monad") }, function(nd) - nd:insert("email", value.new("alex@example.com")) - nd:insert("active", value.new(true)) - end), - n("foo", { value.new("baz"), value.new(1), value.new(2), value.new(3) }, function(nd) - nd:insert("bar", value.new(true)) - nd:insert("quux", value.new(false)) - end) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ + author "Alex Monad" email="alex@example.com" active= #true + foo bar =#true "baz" quux =\ + #false 1 2 3 + ]], + document.new{ + n("author", { value.new("Alex 
Monad") }, function(nd) + nd:insert("email", value.new("alex@example.com")) + nd:insert("active", value.new(true)) + end), + n("foo", { value.new("baz"), value.new(1), value.new(2), value.new(3) }, function(nd) + nd:insert("bar", value.new(true)) + nd:insert("quux", value.new(false)) + end) + }, + 2 + ) end) it("parses nested child nodes", function() - local doc = parse[[ - contents { - section "First section" { - paragraph "This is the first paragraph" - paragraph "This is the second paragraph" + assert.valid_kdl( + [[ + contents { + section "First section" { + paragraph "This is the first paragraph" + paragraph "This is the second paragraph" + } } - } - ]] - local nodes = document.new{ - node.new("contents", {}, { - node.new("section", { value.new("First section") }, { - node.new("paragraph", { value.new("This is the first paragraph") }), - node.new("paragraph", { value.new("This is the second paragraph") }) + ]], + document.new{ + node.new("contents", {}, { + node.new("section", { value.new("First section") }, { + node.new("paragraph", { value.new("This is the first paragraph") }), + node.new("paragraph", { value.new("This is the second paragraph") }) + }) }) - }) - } - assert.same(nodes, doc) + }, + 2 + ) end) it("parses semicolons", function() - local doc = parse("node1; node2; node3;") - local nodes = document.new{ - node.new("node1"), - node.new("node2"), - node.new("node3"), - } - assert.same(nodes, doc) + assert.valid_kdl( + "node1; node2; node3;", + document.new{ + node.new("node1"), + node.new("node2"), + node.new("node3"), + }, + 2 + ) end) it('parses optional child semicolon', function() - local doc = parse('node {foo;bar;baz}') - local nodes = document.new{ - node.new('node', {}, { - node.new('foo'), - node.new('bar'), - node.new('baz') - }) - } - assert.same(nodes, doc) + assert.valid_kdl( + 'node {foo;bar;baz}', + document.new{ + node.new('node', {}, { + node.new('foo'), + node.new('bar'), + node.new('baz') + }) + }, + 2 + ) end) it("parses raw 
strings", function() - local doc = parse[[ - node "this\nhas\tescapes" - other #"C:\Users\zkat\"# - other-raw #"hello"world"# - ]] - local nodes = document.new{ - node.new("node", { value.new("this\nhas\tescapes") }), - node.new("other", { value.new("C:\\Users\\zkat\\") }), - node.new("other-raw", { value.new("hello\"world") }) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ + node "this\nhas\tescapes" + other #"C:\Users\zkat\"# + other-raw #"hello"world"# + ]], + document.new{ + node.new("node", { value.new("this\nhas\tescapes") }), + node.new("other", { value.new("C:\\Users\\zkat\\") }), + node.new("other-raw", { value.new("hello\"world") }) + }, + 2 + ) end) it("parses multiline strings", function() - local doc = parse[[ -string "my + assert.valid_kdl( + [[ +string """ +my multiline -value" -]] - local nodes = document.new{ - node.new("string", { value.new("my\nmultiline\nvalue") }) - } - assert.same(nodes, doc) +value +""" + ]], + document.new{ + node.new("string", { value.new("my\nmultiline\nvalue") }) + }, + 2 + ) + + assert.is_not.valid_kdl('node """foo"""', "Expected NEWLINE, found 'f' (1:6)", 2) + assert.is_not.valid_kdl('node #"""foo"""#', "Expected NEWLINE, found 'f' (1:6)", 2) + assert.is_not.valid_kdl('node """\n oops', "Unterminated multi-line string literal (1:6)", 2) + assert.is_not.valid_kdl('node #"""\n oops', "Unterminated multi-line rawstring literal (1:6)", 2) end) it("parses numbers", function() - local doc = parse[[ - num 1.234e-42 - my-hex 0xdeadbeef - my-octal 0o755 - my-binary 0b10101101 - bignum 1_000_000 - ]] - local nodes = document.new{ - node.new("num", { value.new(1.234e-42) }), - node.new("my-hex", { value.new(0xdeadbeef) }), - node.new("my-octal", { value.new(493) }), - node.new("my-binary", { value.new(173) }), - node.new("bignum", { value.new(1000000) }) - } - assert.same(nodes, doc) - end) - - it("parses comments comments", function() - local doc = parse[[ - // C style - - /* - C style multiline - */ - - tag /*foo=#true*/ 
bar=#false - - /*/* - hello - */*/ - ]] - local nodes = document.new{ - node.new("tag", { ["bar"]=value.new(false) }) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ + num 1.234e-42 + my-hex 0xdeadbeef + my-octal 0o755 + my-binary 0b10101101 + bignum 1_000_000 + ]], + document.new{ + node.new("num", { value.new(1.234e-42) }), + node.new("my-hex", { value.new(0xdeadbeef) }), + node.new("my-octal", { value.new(493) }), + node.new("my-binary", { value.new(173) }), + node.new("bignum", { value.new(1000000) }) + }, + 2 + ) + end) + + it("parses comments", function() + assert.valid_kdl( + [[ + // C style + + /* + C style multiline + */ + + tag /*foo=#true*/ bar=#false + + /*/* + hello + */*/ + ]], + document.new{ + node.new("tag", { ["bar"]=value.new(false) }) + }, + 2 + ) end) it("parses slash dash", function() - local doc = parse[[ - /-mynode "foo" key=1 { - a - b - c - } - - mynode /- "commented" "not commented" /-key="value" /-{ - a - b - } - ]] - local nodes = document.new{ - node.new("mynode", { value.new("not commented") }) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ + /-mynode "foo" key=1 { + a + b + c + } + + mynode /- "commented" "not commented" /-key="value" /-{ + a + b + } + ]], + document.new{ + node.new("mynode", { value.new("not commented") }) + }, + 2 + ) end) it("parses multiline nodes", function() - local doc = parse[[ - title \ - "Some title" - - my-node 1 2 \ // comments are ok after \ - 3 4 - ]] - local nodes = document.new{ - node.new("title", { value.new("Some title") }), - node.new("my-node", { value.new(1), value.new(2), value.new(3), value.new(4) }) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ + title \ + "Some title" + + my-node 1 2 \ // comments are ok after \ + 3 4 + ]], + document.new{ + node.new("title", { value.new("Some title") }), + node.new("my-node", { value.new(1), value.new(2), value.new(3), value.new(4) }) + }, + 2 + ) end) it("parses utf8", function() - local doc = parse[[ - smile "😁" - ノãƒŧド 
お名前īŧ"☜(īžŸãƒŽīžŸâ˜œ)" - ]] - local nodes = document.new{ - node.new("smile", { value.new("😁") }), - node.new("ノãƒŧド", { ["お名前"]=value.new("☜(īžŸãƒŽīžŸâ˜œ)") }) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ + smile "😁" + ノãƒŧド お名前="☜(īžŸãƒŽīžŸâ˜œ)" + ]], + document.new{ + node.new("smile", { value.new("😁") }), + node.new("ノãƒŧド", { ["お名前"]=value.new("☜(īžŸãƒŽīžŸâ˜œ)") }) + }, + 2 + ) end) it("parses node names", function() - local doc = parse[[ - "!@$@$%Q$%~@!40" "1.2.3" "!!!!!"=#true - foo123~!@$%^&*.:'|?+ "weeee" - - 1 - ]] - local nodes = document.new{ - node.new("!@$@$%Q$%~@!40", { value.new("1.2.3"), ["!!!!!"]=value.new(true) }), - node.new("foo123~!@$%^&*.:'|?+", { value.new("weeee") }), - node.new("-", { value.new(1) }) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ +"!@$@$%Q$%~@!40" "1.2.3" "!!!!!"=#true +foo123~!@$%^&*.:'|?+ "weeee" +- 1 + ]], + document.new{ + node.new("!@$@$%Q$%~@!40", { value.new("1.2.3"), ["!!!!!"]=value.new(true) }), + node.new("foo123~!@$%^&*.:'|?+", { value.new("weeee") }), + node.new("-", { value.new(1) }) + }, + 2 + ) end) it("parses escapes", function() - local doc = parse[[ - node1 "\u{1f600}" - node2 "\n\t\r\\\"\f\b" - ]] - local nodes = document.new{ - node.new("node1", { value.new("😀") }), - node.new("node2", { value.new("\n\t\r\\\"\f\b") }) - } - assert.same(nodes, doc) + assert.valid_kdl( + [[ +node1 "\u{1f600}" +node2 "\n\t\r\\\"\f\b" + ]], + document.new{ + node.new("node1", { value.new("😀") }), + node.new("node2", { value.new("\n\t\r\\\"\f\b") }) + }, + 2 + ) + + assert.is_not.valid_kdl('node "\\u"', "Invalid unicode escape (1:6)", 2) + assert.is_not.valid_kdl('node "\\u{}"', "Invalid unicode escape: (1:6)", 2) + assert.is_not.valid_kdl('node "\\u{"', "Invalid unicode escape: \\u{} (1:6)", 2) + assert.is_not.valid_kdl('node "\\u}"', "Invalid unicode escape (1:6)", 2) + assert.is_not.valid_kdl('node "\\u{0123456}"', "Invalid unicode escape: \\u{0123456} (1:6)", 2) end) it("parses node types", function() 
- local doc = parse("(foo)node") - local nodes = document.new{ - node.new("node", {}, {}, "foo") - } - assert.same(nodes, doc) + assert.valid_kdl( + "(foo)node", + document.new{ + node.new("node", {}, {}, "foo") + }, + 2 + ) end) it("parses value types", function() - local doc = parse('node (foo)"bar"') - local nodes = document.new{ - node.new("node", { value.new("bar", "foo") }), - } - assert.same(nodes, doc) + assert.valid_kdl( + 'node (foo)"bar"', + document.new{ + node.new("node", { value.new("bar", "foo") }), + }, + 2 + ) end) it("parses property types", function() - local doc = parse('node baz=(foo)"bar"') - local nodes = document.new{ - node.new("node", { ["baz"]=value.new("bar", "foo") }), - } - assert.same(nodes, doc) + assert.valid_kdl( + 'node baz=(foo)"bar"', + document.new{ + node.new("node", { ["baz"]=value.new("bar", "foo") }), + }, + 2 + ) end) it("parses child types", function() - local doc = parse[[ - node { - (foo)bar - } - ]] - local nodes = document.new{ - node.new("node", {}, { - node.new("bar", {}, {}, "foo"), - }) - } - assert.same(nodes, doc) - end) -end) \ No newline at end of file + assert.valid_kdl( + [[ + node { + (foo)bar + } + ]], + document.new{ + node.new("node", {}, { + node.new("bar", {}, {}, "foo"), + }) + }, + 2 + ) + end) + + it("reads version directive", function() + assert.valid_kdl('/- kdl-version 2\nnode foo', 2) + assert.is_not.valid_kdl('/- kdl-version 1\nnode "foo"', "Version mismatch, expected 2, got 1", 2) + end) +end) diff --git a/spec/support.lua b/spec/support.lua new file mode 100644 index 0000000..1eaefdd --- /dev/null +++ b/spec/support.lua @@ -0,0 +1,46 @@ +local assert = require 'luassert.assert' +local util = require "luassert.util" +local say = require "say" +local kdl = require "kdl" + +local function valid_kdl(state, args) + local expected = args[2] + local version = args[3] + args[3] = nil + args.nofmt = { true, true, true} + if not version and type(expected) == "number" then + version = expected + 
expected = nil + args[2] = nil + end + local success, result = pcall(kdl.parse_document, args[1], version) + local result_str = tostring(result) + if expected then + args[2] = tostring(expected) + if success then + table.insert(args, result_str) + if type(expected) == "string" then + if result_str == expected then return true end + local s, r = pcall(kdl.parse_document, expected, version) + if s then return util.deepcompare(result, r, true) end + return true + else + return util.deepcompare(result, expected, true) or + result_str == tostring(expected) + end + else + result_str = result_str:gsub('^.-:%d+: ', '', 1) + table.insert(args, result_str) + return result_str ~= tostring(expected) + end + else + if success then table.insert(args, "(error)") + else table.insert(args, "(no error)") end + table.insert(args, result_str) + end + return success +end + +say:set("assertion.valid_kdl.positive", "Expected valid KDL.\nInput:\n%s\nExpected:\n%s\nActual:\n%s") +say:set("assertion.valid_kdl.negative", "Expected invalid KDL.\nInput:\n%s\nExpected:\n%s\nActual:\n%s") +assert:register("assertion", "valid_kdl", valid_kdl, "assertion.valid_kdl.positive", "assertion.valid_kdl.negative") diff --git a/spec/tokenizer_spec.lua b/spec/tokenizer_spec.lua index 2a48199..7938eed 100644 --- a/spec/tokenizer_spec.lua +++ b/spec/tokenizer_spec.lua @@ -1,184 +1,189 @@ describe("tokenizer", function() local tokenizer = require "kdl.tokenizer" + local function strip(token) + return { type=token.type, value=token.value } + end + it("can peek at upcoming tokens", function() local t = tokenizer.new("node 1 2 3") - assert.same({ type="IDENT", value="node" }, t:peek()) - assert.same({ type="WS", value=" " }, t:peek_next()) - assert.same({ type="IDENT", value="node" }, t:next()) - assert.same({ type="WS", value=" " }, t:peek()) - assert.same({ type="INTEGER", value=1 }, t:peek_next()) + assert.same({ type="IDENT", value="node" }, strip(t:peek())) + assert.same({ type="WS", value=" " }, 
strip(t:peek_next())) + assert.same({ type="IDENT", value="node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:peek())) + assert.same({ type="INTEGER", value=1 }, strip(t:peek_next())) end) it("tokenizes identifiers", function() - assert.same({ type="IDENT", value="foo" }, tokenizer.new("foo"):next()) + assert.same({ type="IDENT", value="foo" }, strip(tokenizer.new("foo"):next())) + assert.same({ type="IDENT", value="foo-bar123" }, strip(tokenizer.new("foo-bar123"):next())) + assert.same({ type="IDENT", value="-" }, strip(tokenizer.new("-"):next())) + assert.same({ type="IDENT", value="--" }, strip(tokenizer.new("--"):next())) end) it("tokenizes strings", function() - assert.same({ type="STRING", value="foo" }, tokenizer.new('"foo"'):next()) - assert.same({ type="STRING", value="foo\nbar" }, tokenizer.new('"foo\\nbar"'):next()) - assert.same({ type="STRING", value="\u{10FFF}" }, tokenizer.new('"\\u{10FFF}"'):next()) + assert.same({ type="STRING", value="foo" }, strip(tokenizer.new('"foo"'):next())) + assert.same({ type="STRING", value="foo\nbar" }, strip(tokenizer.new('"foo\\nbar"'):next())) + assert.same({ type="STRING", value="\u{10FFF}" }, strip(tokenizer.new('"\\u{10FFF}"'):next())) + assert.same({ type="STRING", value="foo" }, strip(tokenizer.new('"\\\n\n\nfoo"'):next())) end) it("tokenizes multi line strings", function() - assert.same({ type="STRING", value="foo\nbar\n baz\nqux" }, tokenizer.new("\"\n foo\n bar\n baz\n qux\n \""):next()) - assert.same({ type="RAWSTRING", value="foo\nbar\n baz\nqux" }, tokenizer.new("#\"\n foo\n bar\n baz\n qux\n \"#"):next()) + assert.same({ type="STRING", value="foo\nbar\n baz\nqux" }, strip(tokenizer.new('"""\n foo\n bar\n baz\n qux\n """'):next())) + assert.same({ type="RAWSTRING", value="foo\nbar\n baz\nqux" }, strip(tokenizer.new('#"""\n foo\n bar\n baz\n qux\n """#'):next())) end) it("tokenizes rawstrings", function() - assert.same({ type="RAWSTRING", value="foo\\nbar" }, 
tokenizer.new('#"foo\\nbar"#'):next()) - assert.same({ type="RAWSTRING", value="foo\"bar" }, tokenizer.new('#"foo"bar"#'):next()) - assert.same({ type="RAWSTRING", value="foo\"#bar" }, tokenizer.new('##"foo"#bar"##'):next()) - assert.same({ type="RAWSTRING", value="\"foo\"" }, tokenizer.new('#""foo""#'):next()) + assert.same({ type="RAWSTRING", value="foo\\nbar" }, strip(tokenizer.new('#"foo\\nbar"#'):next())) + assert.same({ type="RAWSTRING", value="foo\"bar" }, strip(tokenizer.new('#"foo"bar"#'):next())) + assert.same({ type="RAWSTRING", value="foo\"#bar" }, strip(tokenizer.new('##"foo"#bar"##'):next())) + assert.same({ type="RAWSTRING", value="\"foo\"" }, strip(tokenizer.new('#""foo""#'):next())) local t = tokenizer.new('node #"C:\\Users\\zkat\\"#') - assert.same({ type="IDENT", value="node" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="RAWSTRING", value="C:\\Users\\zkat\\" }, t:next()) + assert.same({ type="IDENT", value="node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="RAWSTRING", value="C:\\Users\\zkat\\" }, strip(t:next())) t = tokenizer.new('other-node #"hello"world"#') - assert.same({ type="IDENT", value="other-node" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="RAWSTRING", value="hello\"world" }, t:next()) + assert.same({ type="IDENT", value="other-node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="RAWSTRING", value="hello\"world" }, strip(t:next())) end) it("tokenizes integers", function() - assert.same({ type="INTEGER", value=0x0123456789abcdef }, tokenizer.new("0x0123456789abcdef"):next()) - assert.same({ type="INTEGER", value=342391 }, tokenizer.new("0o01234567"):next()) - assert.same({ type="INTEGER", value=41 }, tokenizer.new("0b101001"):next()) - assert.same({ type="INTEGER", value=-0x0123456789abcdef }, tokenizer.new("-0x0123456789abcdef"):next()) - assert.same({ 
type="INTEGER", value=-342391 }, tokenizer.new("-0o01234567"):next()) - assert.same({ type="INTEGER", value=-41 }, tokenizer.new("-0b101001"):next()) - assert.same({ type="INTEGER", value=0x0123456789abcdef }, tokenizer.new("+0x0123456789abcdef"):next()) - assert.same({ type="INTEGER", value=342391 }, tokenizer.new("+0o01234567"):next()) - assert.same({ type="INTEGER", value=41 }, tokenizer.new("+0b101001"):next()) + assert.same({ type="INTEGER", value=0x0123456789abcdef }, strip(tokenizer.new("0x0123456789abcdef"):next())) + assert.same({ type="INTEGER", value=342391 }, strip(tokenizer.new("0o01234567"):next())) + assert.same({ type="INTEGER", value=41 }, strip(tokenizer.new("0b101001"):next())) + assert.same({ type="INTEGER", value=-0x0123456789abcdef }, strip(tokenizer.new("-0x0123456789abcdef"):next())) + assert.same({ type="INTEGER", value=-342391 }, strip(tokenizer.new("-0o01234567"):next())) + assert.same({ type="INTEGER", value=-41 }, strip(tokenizer.new("-0b101001"):next())) + assert.same({ type="INTEGER", value=0x0123456789abcdef }, strip(tokenizer.new("+0x0123456789abcdef"):next())) + assert.same({ type="INTEGER", value=342391 }, strip(tokenizer.new("+0o01234567"):next())) + assert.same({ type="INTEGER", value=41 }, strip(tokenizer.new("+0b101001"):next())) end) it("tokenizes floats", function() - assert.same({ type="FLOAT", value=1.23 }, tokenizer.new("1.23"):next()) - assert.same({ type="FLOAT", value=math.huge }, tokenizer.new("#inf"):next()) - assert.same({ type="FLOAT", value=-math.huge }, tokenizer.new("#-inf"):next()) + assert.same({ type="FLOAT", value=1.23 }, strip(tokenizer.new("1.23"):next())) + assert.same({ type="FLOAT", value=math.huge }, strip(tokenizer.new("#inf"):next())) + assert.same({ type="FLOAT", value=-math.huge }, strip(tokenizer.new("#-inf"):next())) local nan = tokenizer.new("#nan"):next() assert.same(nan.type, "FLOAT") assert.is_not.equal(nan.value, nan.value); end) it("tokenizers booleans", function() - assert.same({ 
type="TRUE", value=true }, tokenizer.new("#true"):next()) - assert.same({ type="FALSE", value=false }, tokenizer.new("#false"):next()) + assert.same({ type="TRUE", value=true }, strip(tokenizer.new("#true"):next())) + assert.same({ type="FALSE", value=false }, strip(tokenizer.new("#false"):next())) end) it("tokenizers nulls", function() - assert.same({ type="NULL", value=nil }, tokenizer.new("#null"):next()) + assert.same({ type="NULL", value=nil }, strip(tokenizer.new("#null"):next())) end) it("tokenizers symbols", function() - assert.same({ type="LBRACE", value="{" }, tokenizer.new("{"):next()) - assert.same({ type="RBRACE", value="}" }, tokenizer.new("}"):next()) + assert.same({ type="LBRACE", value="{" }, strip(tokenizer.new("{"):next())) + assert.same({ type="RBRACE", value="}" }, strip(tokenizer.new("}"):next())) end) it("tokenizes equals", function() - assert.same({ type="EQUALS", value="=" }, tokenizer.new("="):next()) - assert.same({ type="EQUALS", value=" =" }, tokenizer.new(" ="):next()) - assert.same({ type="EQUALS", value="= " }, tokenizer.new("= "):next()) - assert.same({ type="EQUALS", value=" = " }, tokenizer.new(" = "):next()) - assert.same({ type="EQUALS", value=" =" }, tokenizer.new(" =foo"):next()) - assert.same({ type="EQUALS", value="\u{FE66}" }, tokenizer.new("\u{FE66}"):next()) - assert.same({ type="EQUALS", value="\u{FF1D}" }, tokenizer.new("\u{FF1D}"):next()) - assert.same({ type="EQUALS", value="🟰" }, tokenizer.new("🟰"):next()) + assert.same({ type="EQUALS", value="=" }, strip(tokenizer.new("="):next())) + assert.same({ type="EQUALS", value=" =" }, strip(tokenizer.new(" ="):next())) + assert.same({ type="EQUALS", value="= " }, strip(tokenizer.new("= "):next())) + assert.same({ type="EQUALS", value=" = " }, strip(tokenizer.new(" = "):next())) + assert.same({ type="EQUALS", value=" =" }, strip(tokenizer.new(" =foo"):next())) end) it("tokenizes whitespace", function() - assert.same({ type="WS", value=" " }, tokenizer.new(" "):next()) - 
assert.same({ type="WS", value="\t" }, tokenizer.new("\t"):next()) - assert.same({ type="WS", value=" \t" }, tokenizer.new(" \t"):next()) - assert.same({ type="WS", value="\\\n" }, tokenizer.new("\\\n"):next()) - assert.same({ type="WS", value="\\" }, tokenizer.new("\\"):next()) - assert.same({ type="WS", value="\\\n" }, tokenizer.new("\\//some comment\n"):next()) - assert.same({ type="WS", value="\\ \n" }, tokenizer.new("\\ //some comment\n"):next()) - assert.same({ type="WS", value="\\" }, tokenizer.new("\\//some comment"):next()) - assert.same({ type="WS", value=" \\\n" }, tokenizer.new(" \\\n"):next()) - assert.same({ type="WS", value=" \\\n" }, tokenizer.new(" \\//some comment\n"):next()) - assert.same({ type="WS", value=" \\ \n" }, tokenizer.new(" \\ //some comment\n"):next()) - assert.same({ type="WS", value=" \\" }, tokenizer.new(" \\//some comment"):next()) - assert.same({ type="WS", value=" \\\n \\\n " }, tokenizer.new(" \\\n \\\n "):next()) + assert.same({ type="WS", value=" " }, strip(tokenizer.new(" "):next())) + assert.same({ type="WS", value="\t" }, strip(tokenizer.new("\t"):next())) + assert.same({ type="WS", value=" \t" }, strip(tokenizer.new(" \t"):next())) + assert.same({ type="WS", value="\\\n" }, strip(tokenizer.new("\\\n"):next())) + assert.same({ type="WS", value="\\" }, strip(tokenizer.new("\\"):next())) + assert.same({ type="WS", value="\\\n" }, strip(tokenizer.new("\\//some comment\n"):next())) + assert.same({ type="WS", value="\\ \n" }, strip(tokenizer.new("\\ //some comment\n"):next())) + assert.same({ type="WS", value="\\" }, strip(tokenizer.new("\\//some comment"):next())) + assert.same({ type="WS", value=" \\\n" }, strip(tokenizer.new(" \\\n"):next())) + assert.same({ type="WS", value=" \\\n" }, strip(tokenizer.new(" \\//some comment\n"):next())) + assert.same({ type="WS", value=" \\ \n" }, strip(tokenizer.new(" \\ //some comment\n"):next())) + assert.same({ type="WS", value=" \\" }, strip(tokenizer.new(" \\//some comment"):next())) + 
assert.same({ type="WS", value=" \\\n \\\n " }, strip(tokenizer.new(" \\\n \\\n "):next())) end) it("tokenizes multiple tokens", function() local t = tokenizer.new("node 1 \"two\" a=3") - assert.same({ type="IDENT", value="node" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="INTEGER", value=1 }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="STRING", value="two" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="IDENT", value="a" }, t:next()) - assert.same({ type="EQUALS", value="=" }, t:next()) - assert.same({ type="INTEGER", value=3 }, t:next()) - assert.same({ type="EOF", value="" }, t:next()) - assert.same({ type=false, value=false }, t:next()) + assert.same({ type="IDENT", value="node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="INTEGER", value=1 }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="STRING", value="two" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="a" }, strip(t:next())) + assert.same({ type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="INTEGER", value=3 }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) end) it("tokenizes single line comments", function() - assert.same({ type="EOF", value="" }, tokenizer.new("// comment"):next()) + assert.same({ type="EOF", value="" }, strip(tokenizer.new("// comment"):next())) local t = tokenizer.new([[node1 // comment node2]]) - assert.same({ type="IDENT", value="node1" }, t:next()) - assert.same({ type="NEWLINE", value="\n" }, t:next()) - assert.same({ type="NEWLINE", value="\n" }, t:next()) - assert.same({ type="IDENT", value="node2" }, t:next()) - assert.same({ type="EOF", value="" }, t:next()) - assert.same({ type=false, value=false }, 
t:next()) + assert.same({ type="IDENT", value="node1" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="IDENT", value="node2" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) end) it("tokenizes multiline comments", function() local t = tokenizer.new("foo /*bar=1*/ baz=2") - assert.same({ type="IDENT", value="foo" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="IDENT", value="baz" }, t:next()) - assert.same({ type="EQUALS", value="=" }, t:next()) - assert.same({ type="INTEGER", value=2 }, t:next()) - assert.same({ type="EOF", value="" }, t:next()) - assert.same({ type=false, value=false }, t:next()) + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="baz" }, strip(t:next())) + assert.same({ type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="INTEGER", value=2 }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) end) it("tokenizes utf8", function() - assert.same({ type="IDENT", value="😁" }, tokenizer.new("😁"):next()) - assert.same({ type="STRING", value="😁" }, tokenizer.new('"😁"'):next()) - assert.same({ type="IDENT", value="ノãƒŧド" }, tokenizer.new("ノãƒŧド"):next()) - assert.same({ type="IDENT", value="お名前" }, tokenizer.new("お名前"):next()) - assert.same({ type="STRING", value="☜(īžŸãƒŽīžŸâ˜œ)" }, tokenizer.new('"☜(īžŸãƒŽīžŸâ˜œ)"'):next()) + assert.same({ type="IDENT", value="😁" }, strip(tokenizer.new("😁"):next())) + assert.same({ type="STRING", value="😁" }, strip(tokenizer.new('"😁"'):next())) + assert.same({ type="IDENT", value="ノãƒŧド" }, strip(tokenizer.new("ノãƒŧド"):next())) + assert.same({ type="IDENT", value="お名前" }, 
strip(tokenizer.new("お名前"):next())) + assert.same({ type="STRING", value="☜(īžŸãƒŽīžŸâ˜œ)" }, strip(tokenizer.new('"☜(īžŸãƒŽīžŸâ˜œ)"'):next())) local t = tokenizer.new([[smile "😁" -ノãƒŧド お名前īŧ"☜(īžŸãƒŽīžŸâ˜œ)"]]) +ノãƒŧド お名前="☜(īžŸãƒŽīžŸâ˜œ)"]]) - assert.same({ type="IDENT", value="smile" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="STRING", value="😁" }, t:next()) - assert.same({ type="NEWLINE", value="\n" }, t:next()) - assert.same({ type="IDENT", value="ノãƒŧド" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="IDENT", value="お名前" }, t:next()) - assert.same({ type="EQUALS", value="īŧ" }, t:next()) - assert.same({ type="STRING", value="☜(īžŸãƒŽīžŸâ˜œ)" }, t:next()) - assert.same({ type="EOF", value="" }, t:next()) - assert.same({ type=false, value=false }, t:next()) + assert.same({ type="IDENT", value="smile" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="STRING", value="😁" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="IDENT", value="ノãƒŧド" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="お名前" }, strip(t:next())) + assert.same({ type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="STRING", value="☜(īžŸãƒŽīžŸâ˜œ)" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) end) it("tokenizes semicolons", function() local t = tokenizer.new("node1; node2") - assert.same({ type="IDENT", value="node1" }, t:next()) - assert.same({ type="SEMICOLON", value=";" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="IDENT", value="node2" }, t:next()) - assert.same({ type="EOF", value="" }, t:next()) - assert.same({ type=false, value=false }, t:next()) + assert.same({ type="IDENT", value="node1" }, 
strip(t:next())) + assert.same({ type="SEMICOLON", value=";" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="node2" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) end) it("tokenizes slash dash", function() @@ -186,56 +191,56 @@ node2]]) a }]]) - assert.same({ type="SLASHDASH", value="/-" }, t:next()) - assert.same({ type="IDENT", value="mynode" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="SLASHDASH", value="/-" }, t:next()) - assert.same({ type="STRING", value="foo" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="SLASHDASH", value="/-" }, t:next()) - assert.same({ type="IDENT", value="key" }, t:next()) - assert.same({ type="EQUALS", value="=" }, t:next()) - assert.same({ type="INTEGER", value=1 }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="SLASHDASH", value="/-" }, t:next()) - assert.same({ type="LBRACE", value="{" }, t:next()) - assert.same({ type="NEWLINE", value="\n" }, t:next()) - assert.same({ type="WS", value=" " }, t:next()) - assert.same({ type="IDENT", value="a" }, t:next()) - assert.same({ type="NEWLINE", value="\n" }, t:next()) - assert.same({ type="RBRACE", value="}" }, t:next()) - assert.same({ type="EOF", value="" }, t:next()) - assert.same({ type=false, value=false }, t:next()) + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="IDENT", value="mynode" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="STRING", value="foo" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="IDENT", value="key" }, strip(t:next())) + assert.same({ 
type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="INTEGER", value=1 }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="LBRACE", value="{" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="a" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="RBRACE", value="}" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) end) it("tokenizes multiline nodes", function() local t = tokenizer.new([[title \ "Some title"]]) - assert.same({ type="IDENT", value="title" }, t:next()) - assert.same({ type="WS", value=" \\\n " }, t:next()) - assert.same({ type="STRING", value="Some title" }, t:next()) - assert.same({ type="EOF", value="" }, t:next()) - assert.same({ type=false, value=false }, t:next()) + assert.same({ type="IDENT", value="title" }, strip(t:next())) + assert.same({ type="WS", value=" \\\n " }, strip(t:next())) + assert.same({ type="STRING", value="Some title" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) end) it("tokenizes types", function() local t = tokenizer.new("(foo)bar") - assert.same({ type="LPAREN", value="(" }, t:next()) - assert.same({ type="IDENT", value="foo" }, t:next()) - assert.same({ type="RPAREN", value=")" }, t:next()) - assert.same({ type="IDENT", value="bar" }, t:next()) + assert.same({ type="LPAREN", value="(" }, strip(t:next())) + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.same({ type="RPAREN", value=")" }, strip(t:next())) + assert.same({ type="IDENT", value="bar" }, strip(t:next())) t = tokenizer.new("(foo)/*asdf*/bar") - assert.same({ 
type="LPAREN", value="(" }, t:next()) - assert.same({ type="IDENT", value="foo" }, t:next()) - assert.same({ type="RPAREN", value=")" }, t:next()) - assert.same({ type="IDENT", value="bar" }, t:next()) + assert.same({ type="LPAREN", value="(" }, strip(t:next())) + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.same({ type="RPAREN", value=")" }, strip(t:next())) + assert.same({ type="IDENT", value="bar" }, strip(t:next())) t = tokenizer.new("(foo/*asdf*/)bar") - assert.same({ type="LPAREN", value="(" }, t:next()) - assert.same({ type="IDENT", value="foo" }, t:next()) - assert.same({ type="RPAREN", value=")" }, t:next()) - assert.same({ type="IDENT", value="bar" }, t:next()) + assert.same({ type="LPAREN", value="(" }, strip(t:next())) + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.same({ type="RPAREN", value=")" }, strip(t:next())) + assert.same({ type="IDENT", value="bar" }, strip(t:next())) end) -end) \ No newline at end of file +end) diff --git a/spec/v1/kdl-org b/spec/v1/kdl-org new file mode 160000 index 0000000..ef93a6b --- /dev/null +++ b/spec/v1/kdl-org @@ -0,0 +1 @@ +Subproject commit ef93a6b10c4e16d94194280bb6687661d7024476 diff --git a/spec/v1/kdl_spec.lua b/spec/v1/kdl_spec.lua new file mode 100644 index 0000000..8f2d7c1 --- /dev/null +++ b/spec/v1/kdl_spec.lua @@ -0,0 +1,39 @@ +local lfs = require "lfs" +local kdl = require "kdl" +require "spec.support" + +describe("kdl", function() + local ignore_list = {"escline_comment_node"} + local function exists(name) + local f=io.open(name,"r") + if f~=nil then io.close(f) return true else return false end + end + + local function readfile(filename) + local f = assert(io.open(filename, "r")) + local s = f:read("a") + f:close() + return s + end + + local TEST_CASES = "spec/v1/kdl-org/tests/test_cases" + for file in lfs.dir(TEST_CASES.."/input") do + local ignored = false + for _, i in ipairs(ignore_list) do + if file:match(i) then ignored = true; break end + end + if 
not ignored and file ~= "." and file ~= ".." then + local input = TEST_CASES.."/input/"..file + local expected = TEST_CASES.."/expected_kdl/"..file + if exists(expected) then + it("parses "..input, function() + assert.valid_kdl(readfile(input), readfile(expected), 1) + end) + else + it("does not parse "..input, function() + assert.is_not.valid_kdl(readfile(input), 1) + end) + end + end + end +end) diff --git a/spec/v1/parser_spec.lua b/spec/v1/parser_spec.lua new file mode 100644 index 0000000..10d0fcd --- /dev/null +++ b/spec/v1/parser_spec.lua @@ -0,0 +1,475 @@ +describe("parser", function() + local parser = require "kdl.v1.parser" + local document = require "kdl.document" + local node = require "kdl.node" + local value = require "kdl.value" + + local function parse(str) + local ok, r = xpcall(parser.parse, debug.traceback, str) + if ok then return r else error(r) end + end + + local function n(name, args, children, ty, fn) + if type(args) == "function" then + fn = args + args = nil + children = nil + ty = nil + end + if type(children) == "function" then + fn = children + children = nil + ty = nil + end + if type(ty) == "function" then + fn = ty + ty = nil + end + local nd = node.new(name, args, children, ty) + if fn then fn(nd) end + return nd + end + + it("parses empty string", function() + assert.same(document.new(), parse("")) + assert.same(document.new(), parse(" ")) + assert.same(document.new(), parse("\n")) + end) + + it("parses nodes", function() + assert.same(document.new{ node.new("node") }, parse("node")) + assert.same(document.new{ node.new("node") }, parse("node\n")) + assert.same(document.new{ node.new("node") }, parse("\nnode\n")) + assert.same( + document.new{ + node.new("node1"), + node.new("node2") + }, + parse("node1\nnode2") + ) + end) + + it("parses node entries", function() + assert.same(document.new{ node.new("node") }, parse("node;")) + assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node 1")) + assert.same( + 
document.new{ + node.new("node", { + value.new(1), + value.new(2), + value.new("3"), + value.new(true), + value.new(false), + value.new(nil) + }), + }, + parse('node 1 2 "3" true false null') + ) + assert.same(document.new{ node.new("node", {}, { node.new("node2") }) }, parse("node {\n node2\n}")) + assert.same(document.new{ node.new("node", {}, { node.new("node2") }) }, parse("node {\n node2 \n}")) + assert.same(document.new{ node.new("node", {}, { node.new("node2") }) }, parse("node { node2; }")) + end) + + it("parses slashdash nodes", function() + assert.same(document.new(), parse("/-node")) + assert.same(document.new(), parse("/- node")) + assert.same(document.new(), parse("/- node\n")) + assert.same(document.new(), parse("/-node 1 2 3")) + assert.same(document.new(), parse("/-node key=false")) + assert.same(document.new(), parse("/-node{\nnode\n}")) + assert.same(document.new(), parse("/-node 1 2 3 key=\"value\" \\\n{\nnode\n}")) + end); + + it("parses slashdash args", function() + assert.same(document.new{ node.new("node") }, parse("node /-1")) + assert.same(document.new{ node.new("node", { value.new(2) }) }, parse("node /-1 2")) + assert.same(document.new{ node.new("node", { value.new(1), value.new(3) }) }, parse("node 1 /- 2 3")) + assert.same(document.new{ node.new("node") }, parse("node /--1")) + assert.same(document.new{ node.new("node") }, parse("node /- -1")) + assert.same(document.new{ node.new("node") }, parse("node \\\n/- -1")) + end) + + it("parses slashdash props", function() + assert.same(document.new{ node.new("node") }, parse("node /-key=1")) + assert.same(document.new{ node.new("node") }, parse("node /- key=1")) + assert.same(document.new{ node.new("node", { ["key"]=value.new(1) }) }, parse("node key=1 /-key2=2")) + end) + + it("parses slashdash children", function() + assert.same(document.new{ node.new("node") }, parse("node /-{}")) + assert.same(document.new{ node.new("node") }, parse("node /- {}")) + assert.same(document.new{ 
node.new("node") }, parse("node /-{\nnode2\n}")) + end) + + it('parses strings', function() + assert.same(document.new{ node.new('node', { value.new("") }) }, parse('node ""')) + assert.same(document.new{ node.new('node', { value.new("hello") }) }, parse('node "hello"')) + assert.same(document.new{ node.new('node', { value.new("hello\nworld") }) }, parse([[node "hello\nworld"]])) + assert.same(document.new{ node.new('node', { value.new("\u{10FFF}") }) }, parse([[node "\u{10FFF}"]])) + assert.same(document.new{ node.new('node', { value.new("\"\\\u{08}\u{0C}\n\r\t") }) }, parse([[node "\"\\\b\f\n\r\t"]])) + assert.same(document.new{ node.new('node', { value.new("\u{10}") }) }, parse([[node "\u{10}"]])) + assert.has_error(function() parser.parse([[node "\i"]]) end, "Unexpected escape: \\i (1:6)") + assert.has_error(function() parser.parse([[node "\u{c0ffee}"]]) end, "Invalid code point \\u{c0ffee} (1:6)") + end) + + it("parses floats", function() + assert.same(document.new{ node.new("node", { value.new(1.0) }) }, parse("node 1.0")) + assert.same(document.new{ node.new("node", { value.new(0.0) }) }, parse("node 0.0")) + assert.same(document.new{ node.new("node", { value.new(-1.0) }) }, parse("node -1.0")) + assert.same(document.new{ node.new("node", { value.new(1.0) }) }, parse("node +1.0")) + assert.same(document.new{ node.new("node", { value.new(1.0e10) }) }, parse("node 1.0e10")) + assert.same(document.new{ node.new("node", { value.new(1.0e-10) }) }, parse("node 1.0e-10")) + assert.same(document.new{ node.new("node", { value.new(123456789.0) }) }, parse("node 123_456_789.0")) + assert.same(document.new{ node.new("node", { value.new(123456789.0) }) }, parse("node 123_456_789.0_")) + assert.has_error(function() parser.parse("node ?1.0") end, "Expected EQUALS, got EOF (1:10)") + assert.has_error(function() parser.parse("node _1.0") end, "Expected EQUALS, got EOF (1:10)") + assert.has_error(function() parser.parse("node 1._0") end, "Invalid number: 1._0 (1:6)") + 
assert.has_error(function() parser.parse("node 1.") end, "Invalid number: 1. (1:6)") + assert.has_error(function() parser.parse("node .0") end, "Expected EQUALS, got EOF (1:8)") + end) + + it("parses integers", function() + assert.same(document.new{ node.new("node", { value.new(0) }) }, parse("node 0")) + assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node 0123456789")) + assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node 0123_456_789")) + assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node 0123_456_789_")) + assert.same(document.new{ node.new("node", { value.new(123456789) }) }, parse("node +0123456789")) + assert.same(document.new{ node.new("node", { value.new(-123456789) }) }, parse("node -0123456789")) + assert.has_error(function() parser.parse('node ?0123456789') end, "Expected EQUALS, got EOF (1:17)") + assert.has_error(function() parser.parse('node _0123456789') end, "Expected EQUALS, got EOF (1:17)") + assert.has_error(function() parser.parse('node a') end, "Expected EQUALS, got EOF (1:7)") + assert.has_error(function() parser.parse('node --') end, "Expected EQUALS, got EOF (1:8)") + end) + + it("parses hexadecimal", function() + assert.same(document.new{ node.new("node", { value.new(0x0123456789abcdef) }) }, parse("node 0x0123456789abcdef")) + assert.same(document.new{ node.new("node", { value.new(0x0123456789abcdef) }) }, parse("node 0x01234567_89abcdef")) + assert.same(document.new{ node.new("node", { value.new(0x0123456789abcdef) }) }, parse("node 0x0123456789abcdef_")) + assert.has_error(function() parser.parse("node 0x_123") end, "Invalid hexadecimal: _123 (1:6)") + assert.has_error(function() parser.parse("node 0xg") end, "Unexpected 'g' (1:6)") + assert.has_error(function() parser.parse("node 0xx") end, "Unexpected 'x' (1:6)") + end) + + it("parses octal", function() + assert.same(document.new{ node.new("node", { value.new(342391) }) }, parse("node 
0o01234567")) + assert.same(document.new{ node.new("node", { value.new(342391) }) }, parse("node 0o0123_4567")) + assert.same(document.new{ node.new("node", { value.new(342391) }) }, parse("node 0o01234567_")) + assert.has_error(function() parser.parse("node 0o_123") end, "Invalid octal: _123 (1:6)") + assert.has_error(function() parser.parse("node 0o8") end, "Unexpected '8' (1:6)") + assert.has_error(function() parser.parse("node 0oo") end, "Unexpected 'o' (1:6)") + end) + + it("parses binary", function() + assert.same(document.new{ node.new("node", { value.new(5) }) }, parse("node 0b0101")) + assert.same(document.new{ node.new("node", { value.new(6) }) }, parse("node 0b01_10")) + assert.same(document.new{ node.new("node", { value.new(6) }) }, parse("node 0b01___10")) + assert.same(document.new{ node.new("node", { value.new(6) }) }, parse("node 0b0110_")) + assert.has_error(function() parser.parse("node 0b_0110") end, "Invalid binary: _0110 (1:6)") + assert.has_error(function() parser.parse("node 0b20") end, "Unexpected '2' (1:6)") + assert.has_error(function() parser.parse("node 0bb") end, "Unexpected 'b' (1:6)") + end) + + it("parses raw strings", function() + assert.same(document.new{ node.new("node", { value.new("foo") }) }, parse([[node r"foo"]])) + assert.same(document.new{ node.new("node", { value.new([[foo\nbar]]) }) }, parse([[node r"foo\nbar"]])) + assert.same(document.new{ node.new("node", { value.new("foo") }) }, parse([[node r#"foo"#]])) + assert.same(document.new{ node.new("node", { value.new("foo") }) }, parse([[node r##"foo"##]])) + assert.same(document.new{ node.new("node", { value.new([[\nfoo\r]]) }) }, parse([[node r#"\nfoo\r"#]])) + assert.has_error(function() parser.parse('node r##"foo"#') end, "Unterminated rawstring literal (1:6)") + end) + + it("parses booleans", function() + assert.same(document.new{ node.new("node", { value.new(true) }) }, parse("node true")) + assert.same(document.new{ node.new("node", { value.new(false) }) }, 
parse("node false")) + end) + + it("parses nulls", function() + assert.same(document.new{ node.new("node", { value.new(nil) }) }, parse("node null")) + end) + + it("parses node spacing", function() + assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node 1")) + assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node\t1")) + assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node\t \\ // hello\n 1")) + end) + + it("parses single line comment", function() + assert.same(document.new{}, parse("//hello")) + assert.same(document.new{}, parse("// \thello")) + assert.same(document.new{}, parse("//hello\n")) + assert.same(document.new{}, parse("//hello\r\n")) + assert.same(document.new{}, parse("//hello\n\r")) + assert.same(document.new{ node.new("world") }, parse("//hello\rworld")) + assert.same(document.new{ node.new("world") }, parse("//hello\nworld\r\n")) + end) + + it("parses multi line comment", function() + assert.same(document.new{}, parse("/*hello*/")); + assert.same(document.new{}, parse("/*hello*/\n")); + assert.same(document.new{}, parse("/*\nhello\r\n*/")); + assert.same(document.new{}, parse("/*\nhello** /\n*/")); + assert.same(document.new{}, parse("/**\nhello** /\n*/")); + assert.same(document.new{ node.new("world") }, parse("/*hello*/world")); + end) + + it("parses esclines", function() + assert.same(document.new{ node.new("node", { value.new(1) }) }, parse("node\\\n 1")) + assert.has_error(function() parser.parse('node\\\nnode2') end, "Expected EQUALS, got EOF (2:6)") + end) + + it("parses whitespace", function() + assert.same(document.new{ node.new("node") }, parse(" node")) + assert.same(document.new{ node.new("node") }, parse("\tnode")) + assert.same(document.new{ node.new("etc") }, parse("/* \nfoo\r\n */ etc")) + end) + + it('parses newlines', function() + assert.same(document.new{ node.new('node1'), node.new('node2') }, parse("node1\nnode2")) + assert.same(document.new{ node.new('node1'), 
node.new('node2') }, parse("node1\rnode2")) + assert.same(document.new{ node.new('node1'), node.new('node2') }, parse("node1\r\nnode2")) + assert.same(document.new{ node.new('node1'), node.new('node2') }, parse("node1\n\nnode2")) + end) + + it("parses basic", function() + local doc = parse('title "Hello, World"') + local nodes = document.new{ + node.new("title", { value.new("Hello, World") }) + } + assert.same(nodes, doc) + end) + + it("parses multiple values", function() + local doc = parse("bookmarks 12 15 188 1234") + local nodes = document.new{ + node.new("bookmarks", { value.new(12), value.new(15), value.new(188), value.new(1234) }) + } + assert.same(nodes, doc) + end) + + it("parses properties", function() + local doc = parse[[ + author "Alex Monad" email="alex@example.com" active=true + foo bar=true "baz" quux=false 1 2 3 + ]] + local nodes = document.new{ + n("author", { value.new("Alex Monad") }, function(nd) + nd:insert("email", value.new("alex@example.com")) + nd:insert("active", value.new(true)) + end), + n("foo", { value.new("baz"), value.new(1), value.new(2), value.new(3) }, function(nd) + nd:insert("bar", value.new(true)) + nd:insert("quux", value.new(false)) + end) + } + assert.same(nodes, doc) + end) + + it("parses nested child nodes", function() + local doc = parse[[ + contents { + section "First section" { + paragraph "This is the first paragraph" + paragraph "This is the second paragraph" + } + } + ]] + local nodes = document.new{ + node.new("contents", {}, { + node.new("section", { value.new("First section") }, { + node.new("paragraph", { value.new("This is the first paragraph") }), + node.new("paragraph", { value.new("This is the second paragraph") }) + }) + }) + } + assert.same(nodes, doc) + end) + + it("parses semicolons", function() + local doc = parse("node1; node2; node3;") + local nodes = document.new{ + node.new("node1"), + node.new("node2"), + node.new("node3"), + } + assert.same(nodes, doc) + end) + + it("parses raw strings", 
function() + local doc = parse[[ + node "this\nhas\tescapes" + other r"C:\Users\zkat\" + other-raw r#"hello"world"# + ]] + local nodes = document.new{ + node.new("node", { value.new("this\nhas\tescapes") }), + node.new("other", { value.new("C:\\Users\\zkat\\") }), + node.new("other-raw", { value.new("hello\"world") }) + } + assert.same(nodes, doc) + end) + + it("parses multiline strings", function() + local doc = parse[[ +string "my +multiline +value" +]] + local nodes = document.new{ + node.new("string", { value.new("my\nmultiline\nvalue") }) + } + assert.same(nodes, doc) + end) + + it("parses numbers", function() + local doc = parse[[ + num 1.234e-42 + my-hex 0xdeadbeef + my-octal 0o755 + my-binary 0b10101101 + bignum 1_000_000 + ]] + local nodes = document.new{ + node.new("num", { value.new(1.234e-42) }), + node.new("my-hex", { value.new(0xdeadbeef) }), + node.new("my-octal", { value.new(493) }), + node.new("my-binary", { value.new(173) }), + node.new("bignum", { value.new(1000000) }) + } + assert.same(nodes, doc) + end) + + it("parses comments comments", function() + local doc = parse[[ + // C style + + /* + C style multiline + */ + + tag /*foo=true*/ bar=false + + /*/* + hello + */*/ + ]] + local nodes = document.new{ + node.new("tag", { ["bar"]=value.new(false) }) + } + assert.same(nodes, doc) + end) + + it("parses slash dash", function() + local doc = parse[[ + /-mynode "foo" key=1 { + a + b + c + } + + mynode /- "commented" "not commented" /-key="value" /-{ + a + b + } + ]] + local nodes = document.new{ + node.new("mynode", { value.new("not commented") }) + } + assert.same(nodes, doc) + end) + + it("parses multiline nodes", function() + local doc = parse[[ + title \ + "Some title" + + my-node 1 2 \ // comments are ok after \ + 3 4 + ]] + local nodes = document.new{ + node.new("title", { value.new("Some title") }), + node.new("my-node", { value.new(1), value.new(2), value.new(3), value.new(4) }) + } + assert.same(nodes, doc) + end) + + it("parses utf8", 
function() + local doc = parse[[ + smile "😁" + ノãƒŧド お名前="☜(īžŸãƒŽīžŸâ˜œ)" + ]] + local nodes = document.new{ + node.new("smile", { value.new("😁") }), + node.new("ノãƒŧド", { ["お名前"]=value.new("☜(īžŸãƒŽīžŸâ˜œ)") }) + } + assert.same(nodes, doc) + end) + + it("parses node names", function() + local doc = parse[[ + "!@#$@$%Q#$%~@!40" "1.2.3" "!!!!!"=true + foo123~!@#$%^&*.:'|?+ "weeee" + ]] + local nodes = document.new{ + node.new("!@#$@$%Q#$%~@!40", { value.new("1.2.3"), ["!!!!!"]=value.new(true) }), + node.new("foo123~!@#$%^&*.:'|?+", { value.new("weeee") }), + } + assert.same(nodes, doc) + end) + + it("parses escapes", function() + local doc = parse[[ + node1 "\u{1f600}" + node2 "\n\t\r\\\"\f\b" + ]] + local nodes = document.new{ + node.new("node1", { value.new("😀") }), + node.new("node2", { value.new("\n\t\r\\\"\f\b") }) + } + assert.same(nodes, doc) + end) + + it("parses node types", function() + local doc = parse("(foo)node") + local nodes = document.new{ + node.new("node", {}, {}, "foo") + } + assert.same(nodes, doc) + end) + + it("parses value types", function() + local doc = parse('node (foo)"bar"') + local nodes = document.new{ + node.new("node", { value.new("bar", "foo") }), + } + assert.same(nodes, doc) + end) + + it("parses property types", function() + local doc = parse('node baz=(foo)"bar"') + local nodes = document.new{ + node.new("node", { ["baz"]=value.new("bar", "foo") }), + } + assert.same(nodes, doc) + end) + + it("parses child types", function() + local doc = parse[[ + node { + (foo)bar + } + ]] + local nodes = document.new{ + node.new("node", {}, { + node.new("bar", {}, {}, "foo"), + }) + } + assert.same(nodes, doc) + end) + + it("reads version directive", function() + local doc = parse('/- kdl-version 1\nnode "foo"') + assert.is_not_nil(doc) + + assert.has_error(function() parser.parse('/- kdl-version 2\nnode foo') end, "Version mismatch, expected 1, got 2") + end) +end) diff --git a/spec/v1/tokenizer_spec.lua b/spec/v1/tokenizer_spec.lua new 
file mode 100644 index 0000000..3654cc9 --- /dev/null +++ b/spec/v1/tokenizer_spec.lua @@ -0,0 +1,225 @@ +describe("tokenizer", function() + local tokenizer = require "kdl.v1.tokenizer" + + local function strip(token) + return { type=token.type, value=token.value } + end + + it("can peek at upcoming tokens", function() + local t = tokenizer.new("node 1 2 3") + assert.same({ type="IDENT", value="node" }, strip(t:peek())) + assert.same({ type="WS", value=" " }, strip(t:peek_next())) + assert.same({ type="IDENT", value="node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:peek())) + assert.same({ type="INTEGER", value=1 }, strip(t:peek_next())) + end) + + it("tokenizes identifiers", function() + assert.same({ type="IDENT", value="foo" }, strip(tokenizer.new("foo"):next())) + assert.same({ type="IDENT", value="foo-bar123" }, strip(tokenizer.new("foo-bar123"):next())) + end) + + it("tokenizes strings", function() + assert.same({ type="STRING", value="foo" }, strip(tokenizer.new('"foo"'):next())) + assert.same({ type="STRING", value="foo\nbar" }, strip(tokenizer.new('"foo\\nbar"'):next())) + assert.same({ type="STRING", value="\u{10FFF}" }, strip(tokenizer.new('"\\u{10FFF}"'):next())) + end) + + it("tokenizes multi line strings", function() + assert.same({ type="STRING", value="\n foo\n bar\n baz\n qux\n " }, strip(tokenizer.new("\"\n foo\n bar\n baz\n qux\n \""):next())) + assert.same({ type="RAWSTRING", value="\n foo\n bar\n baz\n qux\n " }, strip(tokenizer.new("r#\"\n foo\n bar\n baz\n qux\n \"#"):next())) + end) + + it("tokenizes rawstrings", function() + assert.same({ type="RAWSTRING", value="foo\\nbar" }, strip(tokenizer.new('r"foo\\nbar"'):next())) + assert.same({ type="RAWSTRING", value="foo\"bar" }, strip(tokenizer.new('r#"foo"bar"#'):next())) + assert.same({ type="RAWSTRING", value="foo\"#bar" }, strip(tokenizer.new('r##"foo"#bar"##'):next())) + assert.same({ type="RAWSTRING", value="\"foo\"" }, strip(tokenizer.new('r#""foo""#'):next())) + 
+ local t = tokenizer.new('node r#"C:\\Users\\zkat\\"#') + assert.same({ type="IDENT", value="node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="RAWSTRING", value="C:\\Users\\zkat\\" }, strip(t:next())) + + t = tokenizer.new('other-node r#"hello"world"#') + assert.same({ type="IDENT", value="other-node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="RAWSTRING", value="hello\"world" }, strip(t:next())) + end) + + it("tokenizes integers", function() + assert.same({ type="INTEGER", value=0x0123456789abcdef }, strip(tokenizer.new("0x0123456789abcdef"):next())) + assert.same({ type="INTEGER", value=342391 }, strip(tokenizer.new("0o01234567"):next())) + assert.same({ type="INTEGER", value=41 }, strip(tokenizer.new("0b101001"):next())) + assert.same({ type="INTEGER", value=-0x0123456789abcdef }, strip(tokenizer.new("-0x0123456789abcdef"):next())) + assert.same({ type="INTEGER", value=-342391 }, strip(tokenizer.new("-0o01234567"):next())) + assert.same({ type="INTEGER", value=-41 }, strip(tokenizer.new("-0b101001"):next())) + assert.same({ type="INTEGER", value=0x0123456789abcdef }, strip(tokenizer.new("+0x0123456789abcdef"):next())) + assert.same({ type="INTEGER", value=342391 }, strip(tokenizer.new("+0o01234567"):next())) + assert.same({ type="INTEGER", value=41 }, strip(tokenizer.new("+0b101001"):next())) + end) + + it("tokenizes floats", function() + assert.same({ type="FLOAT", value=1.23 }, strip(tokenizer.new("1.23"):next())) + end) + + it("tokenizers booleans", function() + assert.same({ type="TRUE", value=true }, strip(tokenizer.new("true"):next())) + assert.same({ type="FALSE", value=false }, strip(tokenizer.new("false"):next())) + end) + + it("tokenizers nulls", function() + assert.same({ type="NULL", value=nil }, strip(tokenizer.new("null"):next())) + end) + + it("tokenizers symbols", function() + assert.same({ type="LBRACE", value="{" }, 
strip(tokenizer.new("{"):next())) + assert.same({ type="RBRACE", value="}" }, strip(tokenizer.new("}"):next())) + assert.same({ type="EQUALS", value="=" }, strip(tokenizer.new("="):next())) + end) + + it("tokenizes whitespace", function() + assert.same({ type="WS", value=" " }, strip(tokenizer.new(" "):next())) + assert.same({ type="WS", value="\t" }, strip(tokenizer.new("\t"):next())) + assert.same({ type="WS", value=" \t" }, strip(tokenizer.new(" \t"):next())) + assert.same({ type="WS", value="\\\n" }, strip(tokenizer.new("\\\n"):next())) + assert.same({ type="WS", value="\\" }, strip(tokenizer.new("\\"):next())) + assert.same({ type="WS", value="\\\n" }, strip(tokenizer.new("\\//some comment\n"):next())) + assert.same({ type="WS", value="\\ \n" }, strip(tokenizer.new("\\ //some comment\n"):next())) + assert.same({ type="WS", value="\\" }, strip(tokenizer.new("\\//some comment"):next())) + end) + + it("tokenizes multiple tokens", function() + local t = tokenizer.new("node 1 \"two\" a=3") + + assert.same({ type="IDENT", value="node" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="INTEGER", value=1 }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="STRING", value="two" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="a" }, strip(t:next())) + assert.same({ type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="INTEGER", value=3 }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) + end) + + it("tokenizes single line comments", function() + assert.same({ type="EOF", value="" }, strip(tokenizer.new("// comment"):next())) + + local t = tokenizer.new([[node1 +// comment +node2]]) + + assert.same({ type="IDENT", value="node1" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + 
assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="IDENT", value="node2" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) + end) + + it("tokenizes multiline comments", function() + local t = tokenizer.new("foo /*bar=1*/ baz=2") + + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="baz" }, strip(t:next())) + assert.same({ type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="INTEGER", value=2 }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) + end) + + it("tokenizes utf8", function() + assert.same({ type="IDENT", value="😁" }, strip(tokenizer.new("😁"):next())) + assert.same({ type="STRING", value="😁" }, strip(tokenizer.new('"😁"'):next())) + assert.same({ type="IDENT", value="ノãƒŧド" }, strip(tokenizer.new("ノãƒŧド"):next())) + assert.same({ type="IDENT", value="お名前" }, strip(tokenizer.new("お名前"):next())) + assert.same({ type="STRING", value="☜(īžŸãƒŽīžŸâ˜œ)" }, strip(tokenizer.new('"☜(īžŸãƒŽīžŸâ˜œ)"'):next())) + + local t = tokenizer.new([[smile "😁" +ノãƒŧド お名前="☜(īžŸãƒŽīžŸâ˜œ)"]]) + + assert.same({ type="IDENT", value="smile" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="STRING", value="😁" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="IDENT", value="ノãƒŧド" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="お名前" }, strip(t:next())) + assert.same({ type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="STRING", value="☜(īžŸãƒŽīžŸâ˜œ)" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) + end) 
+ + it("tokenizes semicolons", function() + local t = tokenizer.new("node1; node2") + + assert.same({ type="IDENT", value="node1" }, strip(t:next())) + assert.same({ type="SEMICOLON", value=";" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="node2" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) + end) + + it("tokenizes slash dash", function() + local t = tokenizer.new([[/-mynode /-"foo" /-key=1 /-{ + a +}]]) + + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="IDENT", value="mynode" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="STRING", value="foo" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="IDENT", value="key" }, strip(t:next())) + assert.same({ type="EQUALS", value="=" }, strip(t:next())) + assert.same({ type="INTEGER", value=1 }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="SLASHDASH", value="/-" }, strip(t:next())) + assert.same({ type="LBRACE", value="{" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="WS", value=" " }, strip(t:next())) + assert.same({ type="IDENT", value="a" }, strip(t:next())) + assert.same({ type="NEWLINE", value="\n" }, strip(t:next())) + assert.same({ type="RBRACE", value="}" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) + end) + + it("tokenizes multiline nodes", function() + local t = tokenizer.new([[title \ + "Some title"]]) + + assert.same({ type="IDENT", value="title" }, strip(t:next())) + assert.same({ type="WS", value=" 
\\\n " }, strip(t:next())) + assert.same({ type="STRING", value="Some title" }, strip(t:next())) + assert.same({ type="EOF", value="" }, strip(t:next())) + assert.same({ type="EOF", value=nil }, strip(t:next())) + end) + + it("tokenizes types", function() + local t = tokenizer.new("(foo)bar") + assert.same({ type="LPAREN", value="(" }, strip(t:next())) + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.same({ type="RPAREN", value=")" }, strip(t:next())) + assert.same({ type="IDENT", value="bar" }, strip(t:next())) + + t = tokenizer.new("(foo)/*asdf*/bar") + assert.same({ type="LPAREN", value="(" }, strip(t:next())) + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.same({ type="RPAREN", value=")" }, strip(t:next())) + assert.has_error(function() t:next() end, [[Unexpected '/' (1:6)]]) + + t = tokenizer.new("(foo/*asdf*/)bar") + assert.same({ type="LPAREN", value="(" }, strip(t:next())) + assert.same({ type="IDENT", value="foo" }, strip(t:next())) + assert.has_error(function() t:next() end, [[Unexpected '/' (1:5)]]) + end) +end) diff --git a/src/kdl.lua b/src/kdl.lua index 81345cc..96d6286 100644 --- a/src/kdl.lua +++ b/src/kdl.lua @@ -1,9 +1,27 @@ local kdl = { _version = "dev" } local parser = require "kdl.parser" +local tokenizer = require "kdl.tokenizer" +local parser_v1 = require "kdl.v1.parser" -function kdl.parse_document(str) - return parser.parse(str) +function kdl.parse_document(str, version) + if not version then + local t = tokenizer.new(str) + version = t:version_directive() + if not version then + local success, result = pcall(parser.parse, str) + if success then return result end + return parser_v1.parse(str) + end + end + + if version == 1 then + return parser_v1.parse(str) + elseif version == 2 then + return parser.parse(str) + else + error("Unrecognised version '"..version.."'") + end end -return kdl \ No newline at end of file +return kdl diff --git a/src/parser.lua b/src/parser.lua index 63ee298..e01e16e 
100644 --- a/src/parser.lua +++ b/src/parser.lua @@ -14,6 +14,18 @@ function Parser:document() return document.new(nodes) end +function Parser:check_version() + local doc_version = self.tokenizer:version_directive() + if not doc_version then return end + if doc_version ~= 2 then + error("Version mismatch, expected 2, got "..doc_version) + end +end + +local function fail(message, token) + error(message.." ("..token.line..":"..token.column..")") +end + function Parser:nodes() local nodes = {} local n @@ -29,8 +41,7 @@ function Parser:node() local commented = false if self.tokenizer:peek().type == "SLASHDASH" then - self.tokenizer:next() - self:ws() + self:slashdash() commented = true end @@ -38,11 +49,7 @@ function Parser:node() if not type and not self:peek_identifier() then return false end local n = node.new(self:identifier()) - local t = self.tokenizer:peek().type - if t == "WS" or t == "LBRACE" then self:entries(n) - elseif t == "SEMICOLON" then self.tokenizer:next() - elseif t == "LPAREN" then error("Unexpected '('") - end + self:entries(n) if commented then return nil end if type ~= nil then n.type = type end @@ -63,7 +70,7 @@ end function Parser:identifier() local t = self.tokenizer:peek() if self:is_identifier(t) then return self.tokenizer:next().value end - error("Expected identifier, got "..t.type) + fail("Expected identifier, got "..t.type, t) end function Parser:ws() @@ -86,57 +93,81 @@ end function Parser:entries(n) local commented = false + local has_children = false while true do - self:ws() - local p = self.tokenizer:peek().type - if p == "IDENT" then - if self.tokenizer:peek_next().type == "EQUALS" then - local k, v = self:prop() - if not commented then n:insert(k, v) end + local peek = self.tokenizer:peek() + if peek.type == "WS" or peek.type == "SLASHDASH" then + self:ws() + peek = self.tokenizer:peek() + if peek.type == "SLASHDASH" then + self:slashdash() + peek = self.tokenizer:peek() + commented = true + end + if peek.type == "STRING" or 
peek.type == "IDENT" then + if has_children then fail("Unexpected "..peek.type, peek) end + local t = self.tokenizer:peek_next() + if t.type == "EQUALS" then + local k, v = self:prop() + if not commented then n:insert(k, v) end + else + local v = self:value() + if not commented then n:insert(v) end + end + commented = false + elseif peek.type == "NEWLINE" or + peek.type == "EOF" or + peek.type == "SEMICOLON" then + self.tokenizer:next() + return + elseif peek.type == "LBRACE" then + self:lbrace(n, commented) + has_children = true + commented = false + elseif peek.type == "RBRACE" then + self:rbrace() + return else local v = self:value() + if has_children then fail("Unexpected "..peek.type, peek) end if not commented then n:insert(v) end + commented = false end - commented = false - elseif p == "LBRACE" then - self.depth = self.depth + 1 - local children = self:children() - if not commented then n.children = children end - self:node_term() - return - elseif p == "RBRACE" then - if self.depth == 0 then error("Unexpected '}'") end - self.depth = self.depth - 1 - return - elseif p == "SLASHDASH" then - commented = true - self.tokenizer:next() - self:ws() - elseif p == "NEWLINE" or p == "EOF" or p == "SEMICOLON" then + elseif peek.type == "NEWLINE" or + peek.type == "EOF" or + peek.type == "SEMICOLON" then self.tokenizer:next() return - elseif p == "STRING" then - if self.tokenizer:peek_next().type == "EQUALS" then - local k, v = self:prop() - if not commented then n:insert(k, v) end - else - local v = self:value() - if not commented then n:insert(v) end - end + elseif peek.type == "LBRACE" then + self:lbrace(n, commented) + has_children = true commented = false + elseif peek.type == "RBRACE" then + self:rbrace() + return else - local v = self:value() - if not commented then n:insert(v) end - commented = false + fail("Unexpected "..peek.type, peek) end end end +function Parser:lbrace(n, commented) + if not commented and #n.children > 0 then fail("Unexpected {", 
self.tokenizer:peek()) end + self.depth = self.depth + 1 + local children = self:children() + self.depth = self.depth - 1 + if not commented then n.children = children end +end + +function Parser:rbrace() + if self.depth == 0 then fail("Unexpected }", self.tokenizer:peek()) end +end + function Parser:prop() local name = self:identifier() self:expect("EQUALS") - local value = self:value() - return name, value + local val = self:value() + return name, val end function Parser:children() @@ -160,7 +191,7 @@ function Parser:value() t.type == "NULL" then return value.new(t.value, type) end - error("Expected value, got "..t.type) + fail("Expected value, got "..t.type, t) end function Parser:type() @@ -174,27 +205,29 @@ function Parser:type() return type end -function Parser:expect(type) - local t = self.tokenizer:peek().type - if t == type then return self.tokenizer:next() - else error("Expected "..type..", got "..t) end +function Parser:slashdash() + local t = self.tokenizer:next() + if t.type ~= "SLASHDASH" then + fail("Expected SLASHDASH, found "..t.type, t) + end + self:linespaces() + local peek = self.tokenizer:peek() + if peek.type == "RBRACE" or peek.type == "EOF" or peek.type=="SEMICOLON" then + fail("Unexpected "..peek.type.." 
after SLASHDASH", peek) + end end -function Parser:node_term() - self:ws() - local t = self.tokenizer:peek().type - if t == "NEWLINE" or t == "SEMICOLON" or t == "EOF" then - return self.tokenizer:next() - elseif t ~= "RBRACE" then - error("Unexpected "..t) - end +function Parser:expect(type) + local t = self.tokenizer:peek() + if t.type == type then return self.tokenizer:next() + else fail("Expected "..type..", got "..t.type, t) end end function Parser:eof() - local t = self.tokenizer:peek().type - if t == "EOF" or t == false then return end + local t = self.tokenizer:peek() + if t.type == "EOF" then return end - error("Expected EOF, got "..t) + fail("Expected EOF, got "..t.type, t) end function parser.parse(str) @@ -203,7 +236,8 @@ function parser.parse(str) depth=0 } setmetatable(p, { __index = Parser }) + p:check_version() return p:document() end -return parser \ No newline at end of file +return parser diff --git a/src/stringdumper.lua b/src/stringdumper.lua index 1c3d5d8..15f15f1 100644 --- a/src/stringdumper.lua +++ b/src/stringdumper.lua @@ -52,4 +52,4 @@ function stringdumper.dump(string) return s..'"' end -return stringdumper \ No newline at end of file +return stringdumper diff --git a/src/tokenizer.lua b/src/tokenizer.lua index d0226fb..94a930b 100644 --- a/src/tokenizer.lua +++ b/src/tokenizer.lua @@ -13,35 +13,6 @@ local function debom(str) return str end -function string:lines() - local function char(i) - if i < 0 or i > utf8.len(self) then - return nil - end - return utf8.sub(self, i, i) - end - - local lines = {} - local i = 1 - local buffer = "" - while i <= utf8.len(self) do - local c = char(i) - if c == "\r" and char(i+1) == "\n" then - table.insert(lines, buffer) - buffer = "" - i = i+1 - elseif table.contains(util.NEWLINES, c) then - table.insert(lines, buffer) - buffer = "" - else - buffer = buffer..c - end - i = i+1 - end - table.insert(lines, buffer) - return lines -end - function tokenizer.new(str, start) local self = { str=debom(str), @@ 
-55,12 +26,25 @@ function tokenizer.new(str, start) comment_nesting=0, peeked_tokens={}, in_type=false, - last_token=nil + last_token=nil, + line=1, + column=1, + line_at_start=1, + column_at_start=1 } setmetatable(self, { __index = Tokenizer }) return self end +function Tokenizer:version_directive() + local match = self.str:match(util.VERSION_PATTERN) + return match and tonumber(match) +end + +function Tokenizer:fail(message) + error(message.." ("..self.line_at_start..":"..self.column_at_start..")") +end + function Tokenizer:reset() self.index = self.start end @@ -131,42 +115,78 @@ local function valid_integer(s) return s:match("^[+-]?%d[0-9_]*$") end -local function parse_decimal(s) +function Tokenizer:parse_decimal(s) if s:match("[.eE]") and valid_float(s) then return { type="FLOAT", value=tonumber(munch_underscores(s)) } elseif valid_integer(s) then return { type="INTEGER", value=tonumber(munch_underscores(s), 10) } else - if table.contains(util.NON_INITIAL_IDENTIFIER_CHARS, utf8.sub(s, 1, 1)) then error("Invalid number: "..s) end + if table.contains(util.NON_INITIAL_IDENTIFIER_CHARS, utf8.sub(s, 1, 1)) then self:fail("Invalid number: "..s) end for i = 2,utf8.len(s) do - if table.contains(util.NON_IDENTIFIER_CHARS, utf8.sub(s, i, i)) then error("Invalid number: "..s) end + if table.contains(util.NON_IDENTIFIER_CHARS, utf8.sub(s, i, i)) then self:fail("Invalid number: "..s) end end return { type="IDENT", value=s } end end -local function parse_hexadecimal(s) +function Tokenizer:parse_hexadecimal(s) if s:match("^[+-]?%x[0-9a-fA-F_]*$") then return { type="INTEGER", value=tonumber(munch_underscores(s), 16)} end - error("Invalid hexadecimal: "..s) + self:fail("Invalid hexadecimal: "..s) end -local function parse_octal(s) +function Tokenizer:parse_octal(s) if s:match("^[+-]?[0-7][0-7_]*$") then return { type="INTEGER", value=tonumber(munch_underscores(s), 8)} end - error("Invalid octal: "..s) + self:fail("Invalid octal: "..s) end -local function parse_binary(s) 
+function Tokenizer:parse_binary(s) if s:match("^[+-]?[01][01_]*$") then return { type="INTEGER", value=tonumber(munch_underscores(s), 2)} end - error("Invalid binary: "..s) + self:fail("Invalid binary: "..s) +end + +local function unescape_ws(str) + local function char(i) + if i < 0 or i > utf8.len(str) then + return nil + end + return utf8.sub(str, i, i) + end + + local i = 1 + local buffer = "" + while i <= utf8.len(str) do + local c = char(i) + if c == nil then + return buffer + elseif c == "\\" then + local c2 = char(i+1) + if c2 == nil then return buffer + elseif c2 == "\\" then buffer = buffer.."\\\\"; i = i+1 + elseif table.contains(util.WHITESPACE, c2) or table.contains(util.NEWLINES, c2) then + local j = i+2 + local cj = char(j) + while table.contains(util.WHITESPACE, cj) or table.contains(util.NEWLINES, cj) do + j = j+1 + cj = char(j) + end + i = j-1 + else buffer = buffer..c + end + else buffer = buffer..c + end + i = i+1 + end + + return buffer end -local function convert_escapes(str) +function Tokenizer:convert_escapes(str, ws) local function char(i) if i < 0 or i > utf8.len(str) then return nil @@ -192,8 +212,8 @@ local function convert_escapes(str) elseif c2 == "f" then buffer = buffer.."\f"; i = i+1 elseif c2 == "s" then buffer = buffer.." 
"; i = i+1 elseif c2 == "u" then - local c2 = char(i+2) - if c2 ~= "{" then error("Invalid unicode escape") end + c2 = char(i+2) + if c2 ~= "{" then self:fail("Invalid unicode escape") end local hex = "" local j = i+3 local cj = char(j) @@ -202,13 +222,15 @@ local function convert_escapes(str) j = j+1 cj = char(j) end - if #hex > 6 or char(j) ~= "}" then error("Invalid unicode escape: \\u{"..hex.."}") end + if #hex > 6 or char(j) ~= "}" then self:fail("Invalid unicode escape: \\u{"..hex.."}") end local code = tonumber(hex, 16) - if not code then error("Invalid unicode escape: "..hex) end - if code < 0 or code > 0x10FFFF then error(string.format("Invalid code point \\u{%x}", code)) end + if not code then self:fail("Invalid unicode escape: "..hex) end + if code < 0 or code > 0x10FFFF or (code >= 0xD800 and code <= 0xDFFF) then + self:fail(string.format("Invalid code point \\u{%x}", code)) + end i = j buffer = buffer..utf8.char(code) - elseif table.contains(util.WHITESPACE, c2) or table.contains(util.NEWLINES, c2) then + elseif ws and (table.contains(util.WHITESPACE, c2) or table.contains(util.NEWLINES, c2)) then local j = i+2 local cj = char(j) while table.contains(util.WHITESPACE, cj) or table.contains(util.NEWLINES, cj) do @@ -217,7 +239,7 @@ local function convert_escapes(str) end i = j-1 else - error("Unexpected escape: \\"..c2) + self:fail("Unexpected escape: \\"..c2) end else buffer = buffer..c end @@ -227,60 +249,77 @@ local function convert_escapes(str) return buffer end -local function unindent(s) - local all = s:lines() - local indent = all[#all] - local lines = {} - table.move(all, 1, #all-1, 1, lines) +function Tokenizer:unescape(str) + return self:convert_escapes(str, true) +end - if #indent ~= 0 then - for i=1,utf8.len(indent) do - if not table.contains(util.WHITESPACE, utf8.sub(indent,i,i)) then - error("Invalid muliline string final line: '"..indent.."'") - end - end - for _, line in pairs(lines) do - if not line:starts(indent) then - error("Invalid 
multiline string indentation") - end - end +function Tokenizer:unescape_non_ws(str) + return self:convert_escapes(str, false) +end + +function Tokenizer:dedent(str) + local lines = util.lines(str) + local indent = table.remove(lines, #lines) + if not indent:match("^"..util.wss.."$") then + self:fail("Invalid multi-line string final line") end - local result = "" - for i, line in pairs(lines) do - result = result..utf8.sub(line, utf8.len(indent)+1) - if i < #lines then result = result.."\n" end + local valid = "^"..indent.."(.*)" + + local result = {} + for _,line in ipairs(lines) do + if line:match("^"..util.wss.."$") then + table.insert(result, '') + goto continue + end + local m = line:match(valid) + if m then + table.insert(result, m) + goto continue + end + self:fail("Invalid multi-line string indentation") + ::continue:: end - return result + return table.join(result, "\n") end function Tokenizer:_read_next() self.context = nil self.previous_context = nil + self.line_at_start = self.line + self.column_at_start = self.column while true do ::continue:: local c = self:char(self.index) if self.context == nil then - if c == '"' then + if c == nil then + if self.done then + return self:_token("EOF", nil) + end + self.done = true + return self:_token("EOF", "") + elseif c == '"' then self.buffer = "" - if self:char(self.index + 1) == "\n" then + if self:char(self.index + 1) == '"' and self:char(self.index + 2) == '"' then + local nl = self:expect_newline(self.index + 3) self:set_context("multi_line_string") - self.index = self.index + 2 + self:traverse(3 + utf8.len(nl)) else self:set_context("string") - self.index = self.index + 1 + self:traverse(1) end elseif c == "#" then if self:char(self.index + 1) == '"' then - self.rawstring_hashes = 1 self.buffer = "" - if self:char(self.index + 2) == "\n" then + self.rawstring_hashes = 1 + if self:char(self.index + 2) == '"' and self:char(self.index + 3) == '"' then + local nl = self:expect_newline(self.index + 4) 
self:set_context("multi_line_rawstring") - self.index = self.index + 3 + self:traverse(utf8.len(nl) + 4) else self:set_context("rawstring") - self.index = self.index + 2 + self:traverse(2) end goto continue elseif self:char(self.index + 1) == "#" then @@ -292,26 +331,27 @@ function Tokenizer:_read_next() end if self:char(i) == '"' then self.buffer = "" - if self:char(i + 1) == "\n" then + if self:char(i + 1) == '"' and self:char(i + 2) == '"' then + local nl = self:expect_newline(i + 3) self:set_context("multi_line_rawstring") - self.index = i + 2 + self:traverse(self.rawstring_hashes + 3 + utf8.len(nl)) else self:set_context("rawstring") - self.index = i + 1 + self:traverse(self.rawstring_hashes + 1) end goto continue end end self:set_context("keyword") self.buffer = c - self.index = self.index + 1 + self:traverse(1) elseif c == "-" then local n = self:char(self.index + 1) local n2 = self:char(self.index + 2) if n ~= nil and n:match("%d") then if n == "0" and n2 ~= nil and n2:match("[box]") then self:set_context(integer_context(n2)) - self.index = self.index + 2 + self:traverse(2) else self:set_context("decimal") end @@ -319,148 +359,165 @@ function Tokenizer:_read_next() self:set_context("ident") end self.buffer = c - self.index = self.index + 1 + self:traverse(1) elseif c ~= nil and c:match("[0-9+]") then local n = self:char(self.index + 1) local n2 = self:char(self.index + 2) if c == "0" and n ~= nil and n:match("[box]") then - self.index = self.index + 2 self.buffer = "" self:set_context(integer_context(n)) + self:traverse(2) elseif c == "+" and n == "0" and n2 ~= nil and n2:match("[box]") then - self.index = self.index + 3 self.buffer = c self:set_context(integer_context(n2)) + self:traverse(3) else - self:set_context("decimal") - self.index = self.index + 1 self.buffer = c + self:set_context("decimal") + self:traverse(1) end elseif c == "\\" then local t = tokenizer.new(self.str, self.index + 1) local la = t:next() if la.type == "NEWLINE" or la.type == "EOF" 
then - self.index = t.index - self:set_context("whitespace") self.buffer = c..la.value + self:set_context("whitespace") + self:traverse_to(t.index) goto continue elseif la.type == "WS" then local lan = t:next() if lan.type == "NEWLINE" or lan.type == "EOF" then - self.index = t.index - self:set_context("whitespace") self.buffer = c..la.value if lan.type == "NEWLINE" then self.buffer = self.buffer.."\n" end + self:set_context("whitespace") + self:traverse_to(t.index) goto continue end end - error("Unexpected '\\") - elseif table.contains(util.EQUALS, c) then - self:set_context("equals") + self:fail([[Unexpected '\']]) + elseif c == "=" then self.buffer = c - self.index = self.index + 1 + self:set_context("equals") + self:traverse(1) elseif util.SYMBOLS[c] then - self.index = self.index + 1 - return { type=util.SYMBOLS[c], value=c } - elseif c == "\r" then - local n = self:char(self.index + 1) - if n == "\n" then - self.index = self.index + 2 - return { type="NEWLINE", value=c..n } - else - self.index = self.index + 1 - return { type="NEWLINE", value=c } - end - elseif table.contains(util.NEWLINES, c) then - self.index = self.index + 1 - return { type="NEWLINE", value=c } + self:traverse(1) + return self:_token(util.SYMBOLS[c], c) + elseif c == "\r" or table.contains(util.NEWLINES, c) then + local nl = self:expect_newline(self.index) + self:traverse(utf8.len(nl)) + return self:_token("NEWLINE", nl) elseif c == "/" then local n = self:char(self.index + 1) if n == "/" then - if self.in_type or self.last_token == "RPAREN" then error("Unexpected '/'") end + if self.in_type or self.last_token == "RPAREN" then self:fail("Unexpected '/'") end self:set_context("single_line_comment") - self.index = self.index + 2 + self:traverse(2) elseif n == "*" then self:set_context("multi_line_comment") self.comment_nesting = 1 - self.index = self.index + 2 + self:traverse(2) elseif n == "-" then - self.index = self.index + 2 - return { type="SLASHDASH", value="/-" } + self:traverse(2) + 
return self:_token("SLASHDASH", "/-") else - error("Unexpected '"..c.."'") + self:fail("Unexpected '"..c.."'") end elseif table.contains(util.WHITESPACE, c) then - self:set_context("whitespace") self.buffer = c - self.index = self.index + 1 - elseif c == nil then - if self.done then return { type=false, value=false } end - self.done = true - return { type="EOF", value="" } + self:set_context("whitespace") + self:traverse(1) elseif not table.contains(util.NON_INITIAL_IDENTIFIER_CHARS, c) then - self:set_context("ident") self.buffer = c - self.index = self.index + 1 + self:set_context("ident") + self:traverse(1) elseif c == "(" then self.in_type = true - self.index = self.index + 1 - return { type="LPAREN", value=c } + self:traverse(1) + return self:_token("LPAREN", c) elseif c == ")" then self.in_type = false - self.index = self.index + 1 - return { type="RPAREN", value=c } + self:traverse(1) + return self:_token("RPAREN", c) else - error("Unexpected '"..c.."'") + self:fail("Unexpected '"..c.."'") end elseif self.context == "ident" then if c ~= nil and not table.contains(util.NON_IDENTIFIER_CHARS, c) then - self.index = self.index + 1 self.buffer = self.buffer..c + self:traverse(1) else if table.contains(util.RESERVED, self.buffer) then - error("Identifier cannot be a literal") + self:fail("Identifier cannot be a literal") elseif self.buffer:match("^%.%d") then - error("Identifier cannot look like an illegal float") + self:fail("Identifier cannot look like an illegal float") else - return { type="IDENT", value=self.buffer } + return self:_token("IDENT", self.buffer) end end elseif self.context == "keyword" then if c ~= nil and c:match("[a-z%-]") then - self.index = self.index + 1 self.buffer = self.buffer..c + self:traverse(1) else - if self.buffer == "#true" then return { type="TRUE", value=true } end - if self.buffer == "#false" then return { type="FALSE", value=false } end - if self.buffer == "#null" then return { type="NULL", value=nil } end - if self.buffer == 
"#inf" then return { type="FLOAT", value=math.huge } end - if self.buffer == "#-inf" then return { type="FLOAT", value=-math.huge } end - if self.buffer == "#nan" then return { type="FLOAT", value=-(0/0) } end - error("Unknown keyword "..self.buffer) + if self.buffer == "#true" then return self:_token("TRUE", true) end + if self.buffer == "#false" then return self:_token("FALSE", false) end + if self.buffer == "#null" then return self:_token("NULL", nil) end + if self.buffer == "#inf" then return self:_token("FLOAT", math.huge) end + if self.buffer == "#-inf" then return self:_token("FLOAT", -math.huge) end + if self.buffer == "#nan" then return self:_token("FLOAT", -(0/0)) end + self:fail("Unknown keyword "..self.buffer) end - elseif self.context == "string" or self.context == "multi_line_string" then + elseif self.context == "string" then if c == "\\" then self.buffer = self.buffer..c - self.buffer = self.buffer..self:char(self.index + 1) - self.index = self.index + 2 + local c2 = self:char(self.index + 1) + self.buffer = self.buffer..c2 + if table.contains(util.NEWLINES, c2) then + local i = 2 + c2 = self:char(self.index + i) + while table.contains(util.NEWLINES, c2) do + self.buffer = self.buffer..c2 + i = i + 1 + c2 = self:char(self.index + i) + end + self:traverse(i) + else + self:traverse(2) + end elseif c == '"' then - self.index = self.index + 1 - local string = self.buffer - if self.context == "multi_line_string" then string = unindent(string) end - string = convert_escapes(string) - return { type="STRING", value=string } - elseif c == nil or c == "" then - error("Unterminated string literal") + self:traverse(1) + return self:_token("STRING", self:unescape(self.buffer)) + elseif c == "" or c == nil then + self:fail("Unterminated string literal") else + if table.contains(util.NEWLINES, c) then + self:fail("Unexpected NEWLINE in single-line string") + end self.buffer = self.buffer..c - self.index = self.index + 1 + self:traverse(1) end - elseif self.context 
== "rawstring" or self.context == "multi_line_rawstring" then + elseif self.context == "multi_line_string" then + if c == "\\" then + self.buffer = self.buffer..c..self:char(self.index + 1) + self:traverse(2) + elseif c == '"' then + if self:char(self.index + 1) == '"' and self:char(self.index + 2) == '"' then + self:traverse(3) + return self:_token("STRING", self:unescape_non_ws(self:dedent(unescape_ws(self.buffer)))) + end + self.buffer = self.buffer..c + self:traverse(1) + elseif c == "" or c == nil then + self:fail("Unterminated multi-line string literal") + else + self.buffer = self.buffer..c + self:traverse(1) + end + elseif self.context == "rawstring" then if c == nil or c == "" then - error("Unterminated rawstring literal") + self:fail("Unterminated rawstring literal") end if c == '"' then @@ -469,119 +526,142 @@ function Tokenizer:_read_next() h = h + 1 end if h == self.rawstring_hashes then - self.index = self.index + 1 + h - local string = self.buffer - if self.context == "multi_line_rawstring" then string = unindent(string) end - return { type="RAWSTRING", value=string } + self:traverse(1 + h) + return self:_token("RAWSTRING", self.buffer) + end + elseif table.contains(util.NEWLINES, c) then + self:fail("Unexpected NEWLINE in single-line rawstring") + end + + self.buffer = self.buffer..c + self:traverse(1) + elseif self.context == "multi_line_rawstring" then + if c == nil or c == "" then + self:fail("Unterminated multi-line rawstring literal") + end + + if c == '"' and + self:char(self.index + 1) == '"' and + self:char(self.index + 2) == '"' and + self:char(self.index + 3) == '#' then + + local h = 1 + while self:char(self.index + 3 + h) == "#" and h < self.rawstring_hashes do + h = h + 1 + end + if h == self.rawstring_hashes then + self:traverse(h + 3) + return self:_token("RAWSTRING", self:dedent(self.buffer)) end end self.buffer = self.buffer..c - self.index = self.index + 1 + self:traverse(1) elseif self.context == "decimal" then if c ~= nil and 
c:match("[0-9%.%-+_eE]") then - self.index = self.index + 1 self.buffer = self.buffer..c + self:traverse(1) elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then - return parse_decimal(self.buffer) + return self:parse_decimal(self.buffer) else - error("Unexpected '"..c.."'") + self:fail("Unexpected '"..c.."'") end elseif self.context == "hexadecimal" then if c ~= nil and c:match("[0-9a-fA-F_]") then - self.index = self.index + 1 self.buffer = self.buffer..c + self:traverse(1) elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then - return parse_hexadecimal(self.buffer) + return self:parse_hexadecimal(self.buffer) else - error("Unexpected '"..c.."'") + self:fail("Unexpected '"..c.."'") end elseif self.context == "octal" then if c ~= nil and c:match("[0-7_]") then - self.index = self.index + 1 self.buffer = self.buffer..c + self:traverse(1) elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then - return parse_octal(self.buffer) + return self:parse_octal(self.buffer) else - error("Unexpected '"..c.."'") + self:fail("Unexpected '"..c.."'") end elseif self.context == "binary" then if c ~= nil and c:match("[01_]") then - self.index = self.index + 1 self.buffer = self.buffer..c + self:traverse(1) elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then - return parse_binary(self.buffer) + return self:parse_binary(self.buffer) else - error("Unexpected '"..c.."'") + self:fail("Unexpected '"..c.."'") end elseif self.context == "single_line_comment" then if table.contains(util.NEWLINES, c) or c == "\r" then self:set_context(nil) + self.column_at_start = self.column goto continue elseif c == nil then self.done = true - return { type="EOF", value="" } + return self:_token("EOF", "") else - self.index = self.index + 1 + self:traverse(1) end elseif self.context == "multi_line_comment" then local n = self:char(self.index + 
1) if c == "/" and n == "*" then self.comment_nesting = self.comment_nesting + 1 - self.index = self.index + 2 + self:traverse(2) elseif c == "*" and n == "/" then self.comment_nesting = self.comment_nesting - 1 - self.index = self.index + 2 + self:traverse(2) if self.comment_nesting == 0 then self:revert_context() end else - self.index = self.index + 1 + self:traverse(1) end elseif self.context == "whitespace" then if table.contains(util.WHITESPACE, c) then - self.index = self.index + 1 self.buffer = self.buffer..c - elseif table.contains(util.EQUALS, c) then - self:set_context("equals") + self:traverse(1) + elseif c == "=" then self.buffer = self.buffer..c - self.index = self.index + 1 + self:set_context("equals") + self:traverse(1) elseif c == "\\" then local t = tokenizer.new(self.str, self.index + 1) local la = t:next() if la.type == "NEWLINE" or la.type == "EOF" then - self.index = t.index self.buffer = self.buffer..c..la.value + self:traverse_to(t.index) goto continue elseif la.type == "WS" then local lan = t:next() if lan.type == "NEWLINE" or lan.type == "EOF" then - self.index = t.index self.buffer = self.buffer..c..la.value if lan.type == "NEWLINE" then self.buffer = self.buffer.."\n" end + self:traverse_to(t.index) goto continue end end - error("Unexpected '\\'") + self:fail([[Unexpected '\']]) elseif c == "/" and self:char(self.index + 1) == "*" then - self:set_context("multi_line_comment") self.comment_nesting = 1 - self.index = self.index + 2 + self:set_context("multi_line_comment") + self:traverse(2) else - return { type="WS", value=self.buffer } + return self:_token("WS", self.buffer) end elseif self.context == "equals" then local t = tokenizer.new(self.str, self.index) local la = t:next() if la.type == "WS" then self.buffer = self.buffer..la.value - self.index = t.index + self:traverse_to(t.index) end - return { type="EQUALS", value=self.buffer } + return self:_token("EQUALS", self.buffer) elseif self.context == nil then - error("Unexpected nil 
context") + self:fail("Unexpected nil context") else - error("Unexpected context "..self.context) + self:fail("Unexpected context "..self.context) end end end @@ -590,14 +670,49 @@ function Tokenizer:char(i) if i < 1 or i > utf8.len(self.str) then return nil end local c = utf8.sub(self.str, i, i) for _, value in pairs(util.FORBIDDEN) do - if c == value then error("Forbidden character: "..c) end + if c == value then self:fail("Forbidden character: "..c) end end return c end +function Tokenizer:_token(type, value) + return { type=type, value=value, line=self.line_at_start, column=self.column_at_start } +end + +function Tokenizer:traverse(n) + n = n or 1 + for i = 0,n-1 do + local c = self:char(self.index + i) + if c == "\r" then + self.column = 1 + elseif table.contains(util.NEWLINES, c) then + self.line = self.line + 1 + self.column = 1 + else + self.column = self.column + 1 + end + end + self.index = self.index + n +end + +function Tokenizer:traverse_to(i) + self:traverse(i - self.index) +end + function Tokenizer:revert_context() self.context = self.previous_context self.previous_context = nil end -return tokenizer \ No newline at end of file +function Tokenizer:expect_newline(i) + local c = self:char(i) + if c == "\r" then + local n = self:char(i + 1) + if n == "\n" then return c..n end + elseif not table.contains(util.NEWLINES, c) then + self:fail("Expected NEWLINE, found '"..c.."'") + end + return c +end + +return tokenizer diff --git a/src/util.lua b/src/util.lua index 9bf7c1b..478ed4e 100644 --- a/src/util.lua +++ b/src/util.lua @@ -1,5 +1,7 @@ local utf8 = require "lua-utf8" +local util = {} + function table.contains(t, x) for _, value in pairs(t) do if value == x then return true end @@ -7,34 +9,73 @@ function table.contains(t, x) return false end -function string:starts(with) - return utf8.sub(self,1,utf8.len(with)) == with +function table.join(t, sep) + local s = "" + local first = true + sep = sep or "" + for _,v in ipairs(t) do + if first then + s = v + 
first = false + else + s = s .. sep .. v + end + end + return s end -local util = {} +function util.lines(str) + local function char(i) + if i < 0 or i > utf8.len(str) then + return nil + end + return utf8.sub(str, i, i) + end -util.EQUALS = {"=", "īšĻ", "īŧ", "🟰"} + local lines = {} + local i = 1 + local buffer = "" + while i <= utf8.len(str) do + local c = char(i) + if c == "\r" and char(i+1) == "\n" then + table.insert(lines, buffer) + buffer = "" + i = i+1 + elseif table.contains(util.NEWLINES, c) then + table.insert(lines, buffer) + buffer = "" + else + buffer = buffer..c + end + i = i+1 + end + table.insert(lines, buffer) + return lines +end util.SYMBOLS = { ["{"]="LBRACE", ["}"]="RBRACE", - [";"]="SEMICOLON" + [";"]="SEMICOLON", + ["="]="EQUALS" } -for _, value in pairs(util.EQUALS) do - util.SYMBOLS[value] = "EQUALS" -end util.DIGITS = { "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" } util.WHITESPACE = { - "\u{0009}", "\u{000B}", "\u{0020}", "\u{00A0}", - "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", - "\u{2003}", "\u{2004}", "\u{2005}", "\u{2006}", - "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", - "\u{202F}", "\u{205F}", "\u{3000}" + "\u{0009}", "\u{0020}", "\u{00A0}", "\u{1680}", + "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", + "\u{2004}", "\u{2005}", "\u{2006}", "\u{2007}", + "\u{2008}", "\u{2009}", "\u{200A}", "\u{202F}", + "\u{205F}", "\u{3000}" } +util.ws = "["..table.join(util.WHITESPACE).."]" +util.wss = util.ws.."*" -util.NEWLINES = { "\u{000A}", "\u{0085}", "\u{000C}", "\u{2028}", "\u{2029}" } +util.NEWLINES = { "\u{000A}", "\u{0085}", "\u{000B}", "\u{000C}", "\u{2028}", "\u{2029}" } + +util.nl = "["..table.join(util.NEWLINES).."]" +util.nls = util.nl.."*" util.NON_IDENTIFIER_CHARS = { nil, @@ -58,4 +99,6 @@ for i = 0x2066, 0x2069 do table.insert(util.FORBIDDEN, utf8.char(i)) end util.RESERVED = { "true", "false", "null", "inf", "-inf", "nan" } -return util \ No newline at end of file +util.VERSION_PATTERN = 
"^/%-"..util.ws.."+kdl%-version"..util.ws.."+([0-9]+)"..util.wss..util.nls + +return util diff --git a/src/v1/parser.lua b/src/v1/parser.lua new file mode 100644 index 0000000..3b98919 --- /dev/null +++ b/src/v1/parser.lua @@ -0,0 +1,207 @@ +local tokenizer = require "kdl.v1.tokenizer" +local document = require "kdl.document" +local node = require "kdl.node" +local value = require "kdl.value" + +local parser = {} + +local Parser = {} + +function Parser:document() + local nodes = self:nodes() + self:linespaces() + self:eof() + return document.new(nodes) +end + +function Parser:check_version() + local doc_version = self.tokenizer:version_directive() + if not doc_version then return end + if doc_version ~= 1 then + error("Version mismatch, expected 1, got "..doc_version) + end +end + +local function fail(message, token) + error(message.." ("..token.line..":"..token.column..")") +end + +function Parser:nodes() + local nodes = {} + local n + repeat + n = self:node() + if n then table.insert(nodes, n) end + until n == false + return nodes +end + +function Parser:node() + self:linespaces() + + local commented = false + if self.tokenizer:peek().type == "SLASHDASH" then + self.tokenizer:next() + self:ws() + commented = true + end + + local type = self:type() + if not type and not self:peek_identifier() then return false end + local n = node.new(self:identifier()) + + self:entries(n) + + if commented then return nil end + if type ~= nil then n.type = type end + + return n +end + +function Parser:is_identifier(t) + return t.type == "IDENT" or t.type == "STRING" or t.type == "RAWSTRING" +end + +function Parser:peek_identifier() + local t = self.tokenizer:peek() + if self:is_identifier(t) then return t end + return nil +end + +function Parser:identifier() + local t = self.tokenizer:peek() + if self:is_identifier(t) then return self.tokenizer:next().value end + fail("Expected identifier, got "..t.type, t) +end + +function Parser:ws() + local t = self.tokenizer:peek() + while 
t.type == "WS" do + self.tokenizer:next() + t = self.tokenizer:peek() + end +end + +function Parser:linespaces() + while self:is_linespace(self.tokenizer:peek()) do + self.tokenizer:next() + end +end + +function Parser:is_linespace(t) + return t.type == "NEWLINE" or t.type == "WS" +end + +function Parser:entries(n) + local commented = false + while true do + self:ws() + local peek = self.tokenizer:peek() + if peek.type == "IDENT" then + local k, v = self:prop() + if not commented then n:insert(k, v) end + commented = false + elseif peek.type == "LBRACE" then + local child_nodes = self:children() + if not commented then n.children = child_nodes end + self:node_term() + return + elseif peek.type == "SLASHDASH" then + commented = true + self.tokenizer:next() + self:ws() + elseif peek.type == "NEWLINE" or + peek.type == "EOF" or + peek.type == "SEMICOLON" then + self.tokenizer:next() + return + elseif peek.type == "STRING" then + local t = self.tokenizer:peek_next() + if t.type == "EQUALS" then + local k, v = self:prop() + if not commented then n:insert(k, v) end + else + local v = self:value() + if not commented then n:insert(v) end + end + commented = false + else + local v = self:value() + if not commented then n:insert(v) end + commented = false + end + end +end + +function Parser:prop() + local name = self:identifier() + self:expect("EQUALS") + local val = self:value() + return name, val +end + +function Parser:children() + self:expect("LBRACE") + local node_list = self:nodes() + self:linespaces() + self:expect("RBRACE") + return node_list +end + +function Parser:value() + local type = self:type() + local t = self.tokenizer:next() + if t.type == "STRING" or + t.type == "RAWSTRING" or + t.type == "INTEGER" or + t.type == "FLOAT" or + t.type == "TRUE" or + t.type == "FALSE" or + t.type == "NULL" then + return value.new(t.value, type) + end + fail("Expected value, got "..t.type, t) +end + +function Parser:type() + if self.tokenizer:peek().type ~= "LPAREN" then return 
nil end + self:expect("LPAREN") + local type = self:identifier() + self:expect("RPAREN") + return type +end + +function Parser:expect(type) + local t = self.tokenizer:peek() + if t.type == type then return self.tokenizer:next() + else fail("Expected "..type..", got "..t.type, t) end +end + +function Parser:node_term() + self:ws() + local t = self.tokenizer:peek() + if t.type == "NEWLINE" or t.type == "SEMICOLON" or t.type == "EOF" then + self.tokenizer:next() + else + fail("Unexpected "..t.type, t) + end +end + +function Parser:eof() + local t = self.tokenizer:peek() + if t.type == "EOF" then return end + + fail("Expected EOF, got "..t.type, t) +end + +function parser.parse(str) + local p = { + tokenizer=tokenizer.new(str), + depth=0 + } + setmetatable(p, { __index = Parser }) + p:check_version() + return p:document() +end + +return parser diff --git a/src/v1/tokenizer.lua b/src/v1/tokenizer.lua new file mode 100644 index 0000000..496b746 --- /dev/null +++ b/src/v1/tokenizer.lua @@ -0,0 +1,551 @@ +local utf8 = require "lua-utf8" + +local util = require "kdl.v1.util" + +local tokenizer = {} + +local Tokenizer = {} + +local function debom(str) + if utf8.sub(str, 1, 1) == "\u{FEFF}" then + return utf8.sub(str, 2) + end + return str +end + +function tokenizer.new(str, start) + local self = { + str=debom(str), + start=start or 1, + index=start or 1, + context=nil, + rawstring_hashes=-1, + buffer="", + done=false, + previous_context=nil, + comment_nesting=0, + peeked_tokens={}, + in_type=false, + last_token=nil, + line=1, + column=1, + line_at_start=1, + column_at_start=1 + } + setmetatable(self, { __index = Tokenizer }) + return self +end + +function Tokenizer:version_directive() + local match = self.str:match(util.VERSION_PATTERN) + return match and tonumber(match) +end + +function Tokenizer:fail(message) + error(message.." 
("..self.line_at_start..":"..self.column_at_start..")") +end + +function Tokenizer:reset() + self.index = self.start +end + +function Tokenizer:tokens() + local a = {} + while not self.done do + table.insert(a, self:next()) + end + return a +end + +function Tokenizer:set_context(context) + self.previous_context = self.context + self.context = context +end + +function Tokenizer:peek() + if #self.peeked_tokens == 0 then + table.insert(self.peeked_tokens, self:_next()) + end + return self.peeked_tokens[1] +end + +function Tokenizer:peek_next() + if #self.peeked_tokens == 0 then + table.insert(self.peeked_tokens, self:_next()) + table.insert(self.peeked_tokens, self:_next()) + elseif #self.peeked_tokens == 1 then + table.insert(self.peeked_tokens, self:_next()) + end + return self.peeked_tokens[2] +end + +function Tokenizer:next() + if #self.peeked_tokens > 0 then + return table.remove(self.peeked_tokens, 1) + else + return self:_next() + end +end + +function Tokenizer:_next() + local token = self:_read_next() + if token ~= nil and token.type ~= false then self.last_token = token.type end + return token +end + +local function integer_context(n) + if n == "b" then return "binary" end + if n == "o" then return "octal" end + if n == "x" then return "hexadecimal" end +end + +local function munch_underscores(s) + local s2, _ = s:gsub("_", "") + return s2 +end + +local function valid_float(s) + return s:match("^[+-]?%d[0-9_]*(%.%d[0-9_]*)$") or + s:match("^[+-]?%d[0-9_]*$") or + s:match("^[+-]?%d[0-9_]*([eE][+-]?%d[0-9_]*)$") or + s:match("^[+-]?%d[0-9_]*(%.%d[0-9_]*)([eE][+-]?%d[0-9_]*)$") +end + +local function valid_integer(s) + return s:match("^[+-]?%d[0-9_]*$") +end + +function Tokenizer:parse_decimal(s) + if s:match("[.eE]") and valid_float(s) then + return { type="FLOAT", value=tonumber(munch_underscores(s)) } + elseif valid_integer(s) then + return { type="INTEGER", value=tonumber(munch_underscores(s), 10) } + else + if 
table.contains(util.NON_INITIAL_IDENTIFIER_CHARS, utf8.sub(s, 1, 1)) then self:fail("Invalid number: "..s) end + for i = 2,utf8.len(s) do + if table.contains(util.NON_IDENTIFIER_CHARS, utf8.sub(s, i, i)) then self:fail("Invalid number: "..s) end + end + return { type="IDENT", value=s } + end +end + +function Tokenizer:parse_hexadecimal(s) + if s:match("^[+-]?%x[0-9a-fA-F_]*$") then + return { type="INTEGER", value=tonumber(munch_underscores(s), 16)} + end + self:fail("Invalid hexadecimal: "..s) +end + +function Tokenizer:parse_octal(s) + if s:match("^[+-]?[0-7][0-7_]*$") then + return { type="INTEGER", value=tonumber(munch_underscores(s), 8)} + end + self:fail("Invalid octal: "..s) +end + +function Tokenizer:parse_binary(s) + if s:match("^[+-]?[01][01_]*$") then + return { type="INTEGER", value=tonumber(munch_underscores(s), 2)} + end + self:fail("Invalid binary: "..s) +end + +function Tokenizer:unescape(str) + local function char(i) + if i < 0 or i > utf8.len(str) then + return nil + end + return utf8.sub(str, i, i) + end + + local i = 1 + local buffer = "" + while i <= utf8.len(str) do + local c = char(i) + if c == nil then + return buffer + elseif c == "\\" then + local c2 = char(i+1) + if c2 == nil then return buffer + elseif c2 == "n" then buffer = buffer.."\n"; i = i+1 + elseif c2 == "r" then buffer = buffer.."\r"; i = i+1 + elseif c2 == "t" then buffer = buffer.."\t"; i = i+1 + elseif c2 == "\\" then buffer = buffer.."\\"; i = i+1 + elseif c2 == '"' then buffer = buffer..'"'; i = i+1 + elseif c2 == "b" then buffer = buffer.."\b"; i = i+1 + elseif c2 == "f" then buffer = buffer.."\f"; i = i+1 + elseif c2 == "s" then buffer = buffer.." 
"; i = i+1 + elseif c2 == "u" then + c2 = char(i+2) + if c2 ~= "{" then self:fail("Invalid unicode escape") end + local hex = "" + local j = i+3 + local cj = char(j) + while cj and cj:match("%x") do + hex = hex..cj + j = j+1 + cj = char(j) + end + if #hex > 6 or char(j) ~= "}" then self:fail("Invalid unicode escape: \\u{"..hex.."}") end + local code = tonumber(hex, 16) + if not code then self:fail("Invalid unicode escape: "..hex) end + if code < 0 or code > 0x10FFFF or (code >= 0xD800 and code <= 0xDFFF) then + self:fail(string.format("Invalid code point \\u{%x}", code)) + end + i = j + buffer = buffer..utf8.char(code) + elseif table.contains(util.WHITESPACE, c2) or table.contains(util.NEWLINES, c2) then + local j = i+2 + local cj = char(j) + while table.contains(util.WHITESPACE, cj) or table.contains(util.NEWLINES, cj) do + j = j+1 + cj = char(j) + end + i = j-1 + else + self:fail("Unexpected escape: \\"..c2) + end + else buffer = buffer..c + end + i = i+1 + end + + return buffer +end + +function Tokenizer:_read_next() + self.context = nil + self.previous_context = nil + self.line_at_start = self.line + self.column_at_start = self.column + while true do + ::continue:: + local c = self:char(self.index) + if self.context == nil then + if c == nil then + if self.done then + return self:_token("EOF", nil) + end + self.done = true + return self:_token("EOF", "") + elseif c == '"' then + self:set_context("string") + self.buffer = "" + self:traverse(1) + elseif c == "r" then + if self:char(self.index + 1) == '"' then + self:set_context("rawstring") + self:traverse(2) + self.rawstring_hashes = 0 + self.buffer = "" + goto continue + elseif self:char(self.index + 1) == "#" then + local i = self.index + 1 + self.rawstring_hashes = 0 + while self:char(i) == "#" do + self.rawstring_hashes = self.rawstring_hashes + 1 + i = i + 1 + end + if self:char(i) == '"' then + self:set_context("rawstring") + self:traverse(self.rawstring_hashes + 2) + self.buffer = "" + goto continue + end 
+ end + self:set_context("ident") + self.buffer = c + self:traverse(1) + elseif c == "-" then + local n = self:char(self.index + 1) + local n2 = self:char(self.index + 2) + if n ~= nil and n:match("%d") then + if n == "0" and n2 ~= nil and n2:match("[box]") then + self:set_context(integer_context(n2)) + self:traverse(2) + else + self:set_context("decimal") + end + else + self:set_context("ident") + end + self.buffer = c + self:traverse(1) + elseif c ~= nil and c:match("[0-9+]") then + local n = self:char(self.index + 1) + local n2 = self:char(self.index + 2) + if c == "0" and n ~= nil and n:match("[box]") then + self.buffer = "" + self:set_context(integer_context(n)) + self:traverse(2) + elseif c == "+" and n == "0" and n2 ~= nil and n2:match("[box]") then + self.buffer = c + self:set_context(integer_context(n2)) + self:traverse(3) + else + self.buffer = c + self:set_context("decimal") + self:traverse(1) + end + elseif c == "\\" then + local t = tokenizer.new(self.str, self.index + 1) + local la = t:next() + if la.type == "NEWLINE" or la.type == "EOF" then + self.buffer = c..la.value + self:set_context("whitespace") + self:traverse_to(t.index) + goto continue + elseif la.type == "WS" then + local lan = t:next() + if lan.type == "NEWLINE" or lan.type == "EOF" then + self.buffer = c..la.value + if lan.type == "NEWLINE" then + self.buffer = self.buffer.."\n" + end + self:set_context("whitespace") + self:traverse_to(t.index) + goto continue + end + end + self:fail([[Unexpected '\']]) + elseif util.SYMBOLS[c] then + if c == "(" then + self.in_type = true + elseif c == ")" then + self.in_type = false + end + self:traverse(1) + return self:_token(util.SYMBOLS[c], c) + elseif c == "\r" or table.contains(util.NEWLINES, c) then + local nl = self:expect_newline(self.index) + self:traverse(utf8.len(nl)) + return self:_token("NEWLINE", nl) + elseif c == "/" then + local n = self:char(self.index + 1) + if n == "/" then + if self.in_type or self.last_token == "RPAREN" then 
self:fail("Unexpected '/'") end + self:set_context("single_line_comment") + self:traverse(2) + elseif n == "*" then + if self.in_type or self.last_token == "RPAREN" then self:fail("Unexpected '/'") end + self:set_context("multi_line_comment") + self.comment_nesting = 1 + self:traverse(2) + elseif n == "-" then + self:traverse(2) + return self:_token("SLASHDASH", "/-") + else + self:fail("Unexpected '"..c.."'") + end + elseif table.contains(util.WHITESPACE, c) then + self.buffer = c + self:set_context("whitespace") + self:traverse(1) + elseif not table.contains(util.NON_INITIAL_IDENTIFIER_CHARS, c) then + self.buffer = c + self:set_context("ident") + self:traverse(1) + else + self:fail("Unexpected '"..c.."'") + end + elseif self.context == "ident" then + if c ~= nil and not table.contains(util.NON_IDENTIFIER_CHARS, c) then + self.buffer = self.buffer..c + self:traverse(1) + else + if self.buffer == "true" then return self:_token("TRUE", true) + elseif self.buffer == "false" then return self:_token("FALSE", false) + elseif self.buffer == "null" then return self:_token("NULL", nil) + else return self:_token("IDENT", self.buffer) + end + end + elseif self.context == "string" then + if c == "\\" then + local c2 = self:char(self.index + 1) + self.buffer = self.buffer..c..c2 + if table.contains(util.NEWLINES, c2) then + local i = 2 + c2 = self:char(self.index + i) + while table.contains(util.NEWLINES, c2) do + self.buffer = self.buffer..c2 + i = i + 1 + c2 = self:char(self.index + i) + end + self:traverse(i) + else + self:traverse(2) + end + elseif c == '"' then + self:traverse(1) + return self:_token("STRING", self:unescape(self.buffer)) + elseif c == nil or c == "" then + self:fail("Unterminated string literal") + else + self.buffer = self.buffer..c + self:traverse(1) + end + elseif self.context == "rawstring" then + if c == nil or c == "" then + self:fail("Unterminated rawstring literal") + end + + if c == '"' then + local h = 0 + while self:char(self.index + 1 + h) == 
"#" and h < self.rawstring_hashes do + h = h + 1 + end + if h == self.rawstring_hashes then + self:traverse(1 + h) + return self:_token("RAWSTRING", self.buffer) + end + end + + self.buffer = self.buffer..c + self:traverse(1) + elseif self.context == "decimal" then + if c ~= nil and c:match("[0-9%.%-+_eE]") then + self.buffer = self.buffer..c + self:traverse(1) + elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then + return self:parse_decimal(self.buffer) + else + self:fail("Unexpected '"..c.."'") + end + elseif self.context == "hexadecimal" then + if c ~= nil and c:match("[0-9a-fA-F_]") then + self.buffer = self.buffer..c + self:traverse(1) + elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then + return self:parse_hexadecimal(self.buffer) + else + self:fail("Unexpected '"..c.."'") + end + elseif self.context == "octal" then + if c ~= nil and c:match("[0-7_]") then + self.buffer = self.buffer..c + self:traverse(1) + elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then + return self:parse_octal(self.buffer) + else + self:fail("Unexpected '"..c.."'") + end + elseif self.context == "binary" then + if c ~= nil and c:match("[01_]") then + self.buffer = self.buffer..c + self:traverse(1) + elseif table.contains(util.WHITESPACE, c) or table.contains(util.NEWLINES, c) or c == nil then + return self:parse_binary(self.buffer) + else + self:fail("Unexpected '"..c.."'") + end + elseif self.context == "single_line_comment" then + if table.contains(util.NEWLINES, c) or c == "\r" then + self:set_context(nil) + self.column_at_start = self.column + goto continue + elseif c == nil then + self.done = true + return self:_token("EOF", "") + else + self:traverse(1) + end + elseif self.context == "multi_line_comment" then + local n = self:char(self.index + 1) + if c == "/" and n == "*" then + self.comment_nesting = self.comment_nesting + 1 + self:traverse(2) + 
elseif c == "*" and n == "/" then + self.comment_nesting = self.comment_nesting - 1 + self:traverse(2) + if self.comment_nesting == 0 then self:revert_context() end + else + self:traverse(1) + end + elseif self.context == "whitespace" then + if table.contains(util.WHITESPACE, c) then + self.buffer = self.buffer..c + self:traverse(1) + elseif c == "\\" then + local t = tokenizer.new(self.str, self.index + 1) + local la = t:next() + if la.type == "NEWLINE" or la.type == "EOF" then + self.buffer = self.buffer..c..la.value + self:traverse_to(t.index) + goto continue + elseif la.type == "WS" then + local lan = t:next() + if lan.type == "NEWLINE" or lan.type == "EOF" then + self.buffer = self.buffer..c..la.value + if lan.type == "NEWLINE" then + self.buffer = self.buffer.."\n" + end + self:traverse_to(t.index) + goto continue + end + end + self:fail([[Unexpected '\']]) + elseif c == "/" and self:char(self.index + 1) == "*" then + self.comment_nesting = 1 + self:set_context("multi_line_comment") + self:traverse(2) + else + return self:_token("WS", self.buffer) + end + elseif self.context == nil then + self:fail("Unexpected nil context") + else + self:fail("Unexpected context "..self.context) + end + end +end + +function Tokenizer:char(i) + if i < 1 or i > utf8.len(self.str) then return nil end + local c = utf8.sub(self.str, i, i) + for _, value in pairs(util.FORBIDDEN) do + if c == value then self:fail("Forbidden character: "..c) end + end + return c +end + +function Tokenizer:_token(type, value) + return { type=type, value=value, line=self.line_at_start, column=self.column_at_start } +end + +function Tokenizer:traverse(n) + n = n or 1 + for i = 0,n-1 do + local c = self:char(self.index + i) + if c == "\r" then + self.column = 1 + elseif table.contains(util.NEWLINES, c) then + self.line = self.line + 1 + self.column = 1 + else + self.column = self.column + 1 + end + end + self.index = self.index + n +end + +function Tokenizer:traverse_to(i) + self:traverse(i - self.index) 
+end + +function Tokenizer:revert_context() + self.context = self.previous_context + self.previous_context = nil +end + +function Tokenizer:expect_newline(i) + local c = self:char(i) + if c == "\r" then + local n = self:char(i + 1) + if n == "\n" then return c..n end + elseif not table.contains(util.NEWLINES, c) then + self:fail("Expected NEWLINE, found '"..c.."'") + end + return c +end + +return tokenizer diff --git a/src/v1/util.lua b/src/v1/util.lua new file mode 100644 index 0000000..178bc1c --- /dev/null +++ b/src/v1/util.lua @@ -0,0 +1,81 @@ +local utf8 = require "lua-utf8" + +function table.contains(t, x) + for _, value in pairs(t) do + if value == x then return true end + end + return false +end + +function table.join(t, sep) + local s = "" + local first = true + sep = sep or "" + for _,v in ipairs(t) do + if first then + s = v + first = false + else + s = s .. sep .. v + end + end + return s +end + +function string:starts(with) + return utf8.sub(self,1,utf8.len(with)) == with +end + +local util = {} + +util.SYMBOLS = { + ["{"]="LBRACE", + ["}"]="RBRACE", + ["("]="LPAREN", + [")"]="RPAREN", + [";"]="SEMICOLON", + ["="]="EQUALS" +} + +util.DIGITS = { "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" } + +util.WHITESPACE = { + "\u{0009}", "\u{000B}", "\u{0020}", "\u{00A0}", + "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", + "\u{2003}", "\u{2004}", "\u{2005}", "\u{2006}", + "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", + "\u{202F}", "\u{205F}", "\u{3000}" +} +util.ws = "["..table.join(util.WHITESPACE).."]" +util.wss = util.ws.."*" + +util.NEWLINES = { "\u{000A}", "\u{0085}", "\u{000C}", "\u{2028}", "\u{2029}" } + +util.nl = "["..table.join(util.NEWLINES).."]" +util.nls = util.nl.."*" + +util.NON_IDENTIFIER_CHARS = { + nil, + "\r", "\\", "[", "]", "<", ">", "[", "]", '"', ",", "/" +} +for _, value in pairs(util.WHITESPACE) do table.insert(util.NON_IDENTIFIER_CHARS, value) end +for _, value in pairs(util.NEWLINES) do table.insert(util.NON_IDENTIFIER_CHARS, 
value) end +for key, _ in pairs(util.SYMBOLS) do table.insert(util.NON_IDENTIFIER_CHARS, key) end +for i = 0x0000, 0x0020 do table.insert(util.NON_IDENTIFIER_CHARS, utf8.char(i)) end + +util.NON_INITIAL_IDENTIFIER_CHARS = {} +for _, value in pairs(util.NON_IDENTIFIER_CHARS) do table.insert(util.NON_INITIAL_IDENTIFIER_CHARS, value) end +for _, value in pairs(util.DIGITS) do table.insert(util.NON_INITIAL_IDENTIFIER_CHARS, value) end + +util.FORBIDDEN = { "\u{007F}", "\u{FEFF}" } +for i = 0x0000, 0x0008 do table.insert(util.FORBIDDEN, utf8.char(i)) end +for i = 0x000E, 0x001F do table.insert(util.FORBIDDEN, utf8.char(i)) end +for i = 0x200E, 0x200F do table.insert(util.FORBIDDEN, utf8.char(i)) end +for i = 0x202A, 0x202E do table.insert(util.FORBIDDEN, utf8.char(i)) end +for i = 0x2066, 0x2069 do table.insert(util.FORBIDDEN, utf8.char(i)) end + +util.RESERVED = { "true", "false", "null", "inf", "-inf", "nan" } + +util.VERSION_PATTERN = "^/%-"..util.ws.."+kdl%-version"..util.ws.."+([0-9]+)"..util.wss..util.nls + +return util diff --git a/src/value.lua b/src/value.lua index 0241b57..4750e4a 100644 --- a/src/value.lua +++ b/src/value.lua @@ -17,7 +17,7 @@ local function __tostring(self) elseif self.value ~= self.value then s = "#nan" else - s = tostring(self.value) + s = tostring(self.value):upper() end elseif self.value == true then s = "#true" @@ -46,4 +46,4 @@ function value.new(v, type) return self end -return value \ No newline at end of file +return value