diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ef39826..170c643 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/setup-go@v3 with: - go-version: 1.18 + go-version: 1.21 - name: Checkout code uses: actions/checkout@v2 @@ -53,7 +53,7 @@ jobs: uses: actions/checkout@v3 - uses: actions/setup-go@v3 with: - go-version: 1.18 + go-version: 1.21 - uses: actions/cache@v3 with: path: | @@ -65,7 +65,7 @@ jobs: - uses: zencargo/github-action-go-mod-tidy@v1 with: path: . - go-version: 1.18 + go-version: 1.21 test: runs-on: ubuntu-latest @@ -75,7 +75,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: 1.18 + go-version: 1.21 - name: Set up Cache uses: actions/cache@v3 with: diff --git a/go.mod b/go.mod index b823034..c03b753 100644 --- a/go.mod +++ b/go.mod @@ -1,10 +1,11 @@ module github.com/piprate/json-gold -go 1.18 +go 1.21 require ( - github.com/pquerna/cachecontrol v0.0.0-20180517163645-1555304b9b35 - github.com/stretchr/testify v1.8.3 + github.com/cayleygraph/quad v1.3.0 + github.com/pquerna/cachecontrol v0.2.0 + github.com/stretchr/testify v1.9.0 ) require ( diff --git a/go.sum b/go.sum index 0cf1aef..cb1a715 100644 --- a/go.sum +++ b/go.sum @@ -1,12 +1,18 @@ +github.com/cayleygraph/quad v1.3.0 h1:xg7HOLWWPgvZ4CcvzEpfCwq42L8mzYUR+8V0jtYoBzc= +github.com/cayleygraph/quad v1.3.0/go.mod h1:NadtM7uMm78FskmX++XiOOrNvgkq0E1KvvhQdMseMz4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/pquerna/cachecontrol v0.0.0-20180517163645-1555304b9b35 h1:J9b7z+QKAmPf4YLrFg6oQUotqHQeUNWwkvo7jZp1GLU= -github.com/pquerna/cachecontrol v0.0.0-20180517163645-1555304b9b35/go.mod h1:prYjPmNq4d1NPVmpShWobRqXY3q7Vp+80DqgxxUrUIA= -github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= -github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/pquerna/cachecontrol v0.2.0 h1:vBXSNuE5MYP9IJ5kjsdo8uq+w41jSPgvba2DEnkRx9k= +github.com/pquerna/cachecontrol v0.2.0/go.mod h1:NrUG3Z7Rdu85UNR3vm7SOsl1nFIeSiQnrHV5K9mBcUI= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/ld/serialize_nquads.go b/ld/serialize_nquads.go index db4c2f7..e72f7af 100644 --- a/ld/serialize_nquads.go +++ b/ld/serialize_nquads.go @@ -19,8 +19,10 @@ import ( "bytes" "fmt" "io" - "regexp" "strings" + + "github.com/cayleygraph/quad" + "github.com/cayleygraph/quad/nquads" ) // NQuadRDFSerializer parses and serializes N-Quads. @@ -108,15 +110,6 @@ func toNQuad(triple *Quad, graphName string) string { return quad } -func unescape(str string) string { - str = strings.ReplaceAll(str, "\\\\", "\\") - str = strings.ReplaceAll(str, "\\\"", "\"") - str = strings.ReplaceAll(str, "\\n", "\n") - str = strings.ReplaceAll(str, "\\r", "\r") - str = strings.ReplaceAll(str, "\\t", "\t") - return str -} - func escape(str string) string { str = strings.ReplaceAll(str, "\\", "\\\\") str = strings.ReplaceAll(str, "\"", "\\\"") @@ -126,66 +119,6 @@ func escape(str string) string { return str } -const ( - wso = "[ \\t]*" - iri = "(?:<([^:]+:[^>]*)>)" - - // https://www.w3.org/TR/turtle/#grammar-production-BLANK_NODE_LABEL - - pnCharsBase = "A-Z" + "a-z" + - "\u00C0-\u00D6" + - "\u00D8-\u00F6" + - "\u00F8-\u02FF" + - "\u0370-\u037D" + - "\u037F-\u1FFF" + - "\u200C-\u200D" + - "\u2070-\u218F" + - "\u2C00-\u2FEF" + - "\u3001-\uD7FF" + - "\uF900-\uFDCF" + - "\uFDF0-\uFFFD" - // TODO: - //"\u10000-\uEFFFF" - - pnCharsU = pnCharsBase + "_" - - pnChars = pnCharsU + - "0-9" + - "-" + - "\u00B7" + - "\u0300-\u036F" + - "\u203F-\u2040" - - blankNodeLabel = "(_:" + - "(?:[" + pnCharsU + "0-9])" + - "(?:(?:[" + pnChars + ".])*(?:[" + pnChars + "]))?" + - ")" - - // '(_:' + - // '(?:[' + PN_CHARS_U + '0-9])' + - // '(?:(?:[' + PN_CHARS + '.])*(?:[' + PN_CHARS + ']))?' + - // ')'; - - bnode = blankNodeLabel - - plain = "\"([^\"\\\\]*(?:\\\\.[^\"\\\\]*)*)\"" - datatype = "(?:\\^\\^" + iri + ")" - language = "(?:@([a-z]+(?:-[a-zA-Z0-9]+)*))" - literal = "(?:" + plain + "(?:" + datatype + "|" + language + ")?)" - ws = "[ \\t]+" - - subject = "(?:" + iri + "|" + bnode + ")" + ws - property = iri + ws - object = "(?:" + iri + "|" + bnode + "|" + literal + ")" + wso - graph = "(?:\\.|(?:(?:" + iri + "|" + bnode + ")" + wso + "\\.))" -) - -var regexEmpty = regexp.MustCompile("^" + wso + "$") - -// full quad regex - -var regexQuad = regexp.MustCompile("^" + wso + subject + property + object + graph + wso + "$") //nolint:gocritic - type lineScanner interface { Bytes() []byte Scan() bool @@ -253,53 +186,63 @@ func ParseNQuadsFrom(o interface{}) (*RDFDataset, error) { lineNumber++ // skip empty lines - if regexEmpty.Match(line) { + if isEmpty(line) { continue } // parse quad - if !regexQuad.Match(line) { - return nil, NewJsonLdError(SyntaxError, fmt.Errorf("error while parsing N-Quads; invalid quad. line: %d", lineNumber)) + q, err := nquads.ParseRaw(string(line)) + if err != nil { + return nil, NewJsonLdError(SyntaxError, fmt.Errorf("error while parsing N-Quads; invalid quad. line: %d. reason: %w", lineNumber, err)) } - match := regexQuad.FindStringSubmatch(string(line)) // get subject var subject Node - if match[1] != "" { - subject = NewIRI(unescape(match[1])) - } else { - subject = NewBlankNode(unescape(match[2])) + switch v := q.Subject.(type) { + case quad.IRI: + subject = NewIRI(string(v)) + case quad.BNode: + subject = NewBlankNode(v.String()) + default: + return nil, fmt.Errorf("invalid subject: %s", q.Subject.String()) } // get predicate - predicate := NewIRI(unescape(match[3])) + var predicate Node + if iri, ok := q.Predicate.(quad.IRI); ok { + predicate = NewIRI(string(iri)) + } else { + return nil, fmt.Errorf("invalid predicate: %s", q.Predicate.String()) + } // get object var object Node - if match[4] != "" { - object = NewIRI(unescape(match[4])) - } else if match[5] != "" { - object = NewBlankNode(unescape(match[5])) - } else { - language := unescape(match[8]) - var datatype string - if match[7] != "" { - datatype = unescape(match[7]) - } else if match[8] != "" { - datatype = RDFLangString - } else { - datatype = XSDString - } - unescaped := unescape(match[6]) - object = NewLiteral(unescaped, datatype, language) + switch obj := q.Object.(type) { + case quad.IRI: + object = NewIRI(string(obj)) + case quad.BNode: + object = NewBlankNode(obj.String()) + case quad.TypedString: + object = NewLiteral(string(obj.Value), string(obj.Type), "") + case quad.LangString: + object = NewLiteral(string(obj.Value), RDFLangString, obj.Lang) + case quad.String: + object = NewLiteral(string(obj), XSDString, "") + default: + return nil, fmt.Errorf("invalid object: %s", q.Object.String()) } // get graph name ('@default' is used for the default graph) name := "@default" - if match[9] != "" { - name = unescape(match[9]) - } else if match[10] != "" { - name = unescape(match[10]) + if label := q.Label; label != nil { + switch label := label.(type) { + case quad.IRI: + name = string(label) + case quad.BNode: + name = label.String() + default: + return nil, fmt.Errorf("invalid label: %s", q.Label.String()) + } } triple := NewQuad(subject, predicate, object, name) @@ -331,3 +274,12 @@ func ParseNQuadsFrom(o interface{}) (*RDFDataset, error) { func ParseNQuads(input string) (*RDFDataset, error) { return ParseNQuadsFrom(input) } + +func isEmpty(line []byte) bool { + for _, b := range line { + if b != ' ' && b != '\t' { + return false + } + } + return true +} diff --git a/ld/serialize_nquads_bench_test.go b/ld/serialize_nquads_bench_test.go new file mode 100644 index 0000000..fdadd2d --- /dev/null +++ b/ld/serialize_nquads_bench_test.go @@ -0,0 +1,51 @@ +// Copyright 2026 Siemens AG +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ld + +import ( + "testing" +) + +var benchInput string = ` + . + +_:b0 . + + _:b0 . + + "literal value" . + + "Hello World"@en . + + "42"^^ . + + "Line 1\\nLine 2\\tTabbed" . + + . + + _:graph . + + "Quote: \"nested\" and backslash: \\\\" . +` + +func BenchmarkParseNQuadsFrom(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + _, err := ParseNQuads(benchInput) + if err != nil { + b.Fatalf("failed to parse benchInput: %s", err) + } + } +}