diff --git a/.gitignore b/.gitignore index 57edc61..ead7226 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -test/*.json -test/test -test/*.pprof +benchmarks/*.json +benchmarks/test +*.pprof diff --git a/README.md b/README.md index 173bc53..9398e49 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,5 @@ # libjson -> WARNING: libjson is currently a work in progress :) - Fast and minimal JSON parser written in and for Go with a JIT query language ```go @@ -13,16 +11,18 @@ import ( func main() { input := `{ "hello": {"world": ["hi"] } }` - jsonObj, _ := New(input) // or libjson.NewReader(r io.Reader) + jsonObj, _ := libjson.New([]byte(input)) // or libjson.NewReader(r io.Reader) // accessing values - fmt.Println(Get[string](jsonObj, ".hello.world.0")) // hi, nil + fmt.Println(libjson.Get[string](jsonObj, ".hello.world.0")) // hi, nil } ``` ## Features -- [ECMA 404](https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf) +- Parser consumes and mutates the input to make most operations zero copy and zero alloc +- Full materialisation, no type access helpers or other weird overhead +- [ECMA 404](https://ecma-international.org/publications-and-standards/standards/ecma-404/) and [rfc8259](https://www.rfc-editor.org/rfc/rfc8259) compliant - tests against [JSONTestSuite](https://github.com/nst/JSONTestSuite), see [Parsing JSON is a Minefield @@ -30,156 +30,73 @@ func main() { - no trailing commata, comments, `Nan` or `Infinity` - top level atom/skalars, like strings, numbers, true, false and null - uft8 support via go [rune](https://go.dev/blog/strings) -- no reflection, uses a custom query language similar to JavaScript object access instead +- no reflection, uses a custom query language similar to JavaScript object access instead, or simply use the go values as is - generics for value insertion and extraction with `libjson.Get` and `libjson.Set` - caching of queries with `libjson.Compile`, just in time caching of queries - serialisation via `json.Marshal` +## Why is it faster than encoding/json? + +- zero-copy strings +- mutate input for string escaping instead of allocating +- no allocations for strings, views into the original input +- no reflection +- no copies for map keys +- very simple lexer and parser + ## Benchmarks -![libjson-vs-encodingjson](https://github.com/user-attachments/assets/b11bcce4-e7db-4c45-ab42-45a2042e2a51) +### Go internal +| Benchmark | ns/op | B/op | allocs/op | speedup | alloc reduction | +| --------------------- | ----------- | ----------- | --------- | ------- | --------------- | +| libjson Naive | 46,294,122 | 34,907,845 | 500,023 | 1.85x | 2.10x fewer | +| encoding/json Naive | 85,502,921 | 42,744,522 | 1,050,031 | - | - | +| libjson Escaped | 38,199,760 | 25,394,245 | 350,023 | 2.32x | 3.14x fewer | +| encoding/json Escaped | 88,478,499 | 37,544,406 | 1,100,030 | - | - | +| libjson Hard | 154,178,081 | 139,915,859 | 1,400,023 | 2.52x | 2.14x fewer | +| encoding/json Hard | 388,198,395 | 173,944,514 | 3,000,032 | - | - | -These results were generated with the following specs: +Run via -```text -OS: Arch Linux x86_64 -Kernel: 6.10.4-arch2-1 -Memory: 32024MiB -Go version: 1.23 +```shell +go test -bench=. -benchmem ``` -Below this section is a list of performance improvements and their impact on -the overall performance as well as the full results of -[test/bench.sh](test/bench.sh). +Results in: -### [b23001e](https://github.com/xNaCly/libjson/commit/b23001eca470935976a36cfbbc7a3c773d784a03) +```text +goos: linux +goarch: amd64 +pkg: github.com/xnacly/libjson +cpu: AMD Ryzen 7 3700X 8-Core Processor +BenchmarkLibJson_Naive-16 26 46294122 ns/op 34907845 B/op 500023 allocs/op +BenchmarkLibJson_Escaped-16 28 38199760 ns/op 25394245 B/op 350023 allocs/op +BenchmarkLibJson_Hard-16 7 154178081 ns/op 139915859 B/op 1400023 allocs/op +BenchmarkEncodingJson_Naive-16 13 85502921 ns/op 42744522 B/op 1050031 allocs/op +BenchmarkEncodingJson_Escaped-16 12 88478499 ns/op 37544406 B/op 1100030 allocs/op +BenchmarkEncodingJson_Hard-16 3 388198395 ns/op 173944514 B/op 3000032 allocs/op +PASS +ok github.com/xnacly/libjson 8.510s +``` -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 24.2ms | 11.5ms | -| 5MB | 117.3ms | 48.5ms | -| 10MB | 225ms | 91ms | - -- manually inlined `parser::expect` +### HUGE inputs -### [0058abb](https://github.com/xNaCly/libjson/commit/0058abb7381735b27783f9809947d7e0f22d9b05) - -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 24.2ms | 12.0ms | -| 5MB | 117.3ms | 49.8ms | -| 10MB | 225ms | 93.8ms | - -- replaced byte slices with offsets and lengths in the `token` struct - -### [88c5eb9](https://github.com/xNaCly/libjson/commit/88c5eb91c4fb1586af29b2cab3563b6ade424323) - -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 25.2ms | 12.0ms | -| 5MB | 117.3ms | 50ms | -| 10MB | 227ms | 96ms | - -This commit made the tests more comparably by actually unmarshalling json into -a go data structure. - -### [a36a1bd](https://github.com/xNaCly/libjson/commit/a36a1bd042b10ce779c95c7c1e52232cf8d16fab) - -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 12.0ms | 13.4ms | -| 5MB | 58.4ms | 66.3ms | -| 10MB | 114.0ms | 127.0ms | - -- switch `token.Val` from `string` to `[]byte`, allows zero values to be `nil` and not `""` -- move string allocation for `t_string` and `t_number` to `(*parser).atom()` - -### [58e19ff](https://github.com/xNaCly/libjson/commit/58e19ffa140b01ff873505cb500364c4fea566db) - -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 12.3ms | 14.2ms | -| 5MB | 59.6ms | 68.8ms | -| 10MB | 115.3ms | 131.8ms | - -The changes below resulted in the following savings: \~6ms for 1MB, \~25ms for -5MB and \~60ms for 10MB. - -- reuse buffer `lexer.buf` for number and string processing -- switch from `(*bufio.Reader).ReadRune()` to `(*bufio.Reader).ReadByte()` -- used `*(*string)(unsafe.Pointer(&l.buf))` to skip strings.Builder usage for - number and string processing -- remove and inline buffer usage for null, true and false, skipping allocations -- benchmark the optimal initial cap for `lexer.buf`, maps and arrays to be 8 -- remove `errors.Is` and check for `t_eof` instead -- move number parsing to `(*parser).atom()` and change type of `token.Val` to string, - this saves a lot of assertions, etc - -### [58d9360](https://github.com/xNaCly/libjson/commit/58d9360bae0576e761e021ee52035713206fdab1) - -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 12.2ms | 19.9ms | -| 5MB | 60.2ms | 95.2ms | -| 10MB | 117.2ms | 183.8ms | - -I had to change some things to account for issues occuring in the reading of -atoms, such as true, false and null. All of those are read by buffering the -size of chars they have and reading this buffer at once, instead of iterating -and multiple reads. This did not work correctly because i used -`(*bufio.Reader).Read`, which sometimes does not read all bytes fitting in the -buffer passed into it. Thats why these commit introduces a lot of performance -regressions. - -### [e08beba](https://github.com/xNaCly/libjson/commit/e08bebada39441d9b6a20cb05251488ddce68285) - -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 11.7ms | 13.1ms | -| 5MB | 55.2ms | 64.8ms | - -The optimisation in this commit is to no longer tokenize the whole input before -starting the parser but attaching the lexer to the parser. This allows the -parser to invoke the tokenization of the next token on demand, for instance -once the parser needs to advance. This reduces the runtime around 4ms for the -1MB input and 14ms for 5MB, resulting in a 1.33x and a 1.22x runtime reduction, -pretty good for such a simple change. - -### [be686d2](https://github.com/xNaCly/libjson/commit/be686d2c85c07cdfa91295052db54001d8cd5cc8) - -| JSON size | `encoding/json` | `libjson` | -| --------- | --------------- | --------- | -| 1MB | 11.7ms | 17.4ms | -| 5MB | 55.2ms | 78.5ms | - -For the first naiive implementation, these results are fairly good and not too -far behind the `encoding/go` implementation, however there are some potential -low hanging fruit for performance improvements and I will invest some time into -them. - -No specific optimisations made here, except removing the check for duplicate -object keys, because -[rfc8259](https://www.rfc-editor.org/rfc/rfc8259) says: - -> When the names within an object are not -> unique, the behavior of software that receives such an object is -> unpredictable. Many implementations report the last name/value pair only. -> Other implementations report an error or fail to parse the object, and some -> implementations report all of the name/value pairs, including duplicates. - -Thus I can decide wheter or not I want to error on duplicate keys, or simply -let each duplicate key overwrite the previous value in the object, however -checking if a given key is already in the map/object requires that key to be -hashed and the map to be indexed with that key, omitting this check saves us -these operations, thus making the parser faster for large objects. - -### Reproduce locally +| Input size | library | time | faster | +| ----- | ------------- | ------- | ------ | +| 1MB | libjson | 8.7ms | 1.73x | +| | encoding/json | 15.0ms | | +| 5MB | libjson | 33.2ms | 1.99x | +| | encoding/json | 66.3ms | | +| 10MB | libjson | 64.4ms | 2.04x | +| | encoding/json | 131.6ms | | +| 100MB | libjson | 618.2ms | 2.06x | +| | encoding/json | 1273ms | | > Make sure you have the go toolchain and python3 installed for this. ```shell -cd test/ +cd benchmarks/ chmod +x ./bench.sh ./bench.sh ``` @@ -187,28 +104,50 @@ chmod +x ./bench.sh Output looks something like: ```text -fetching example data +generating example data building executable -Benchmark 1: ./test ./1MB.json - Time (mean ± σ): 13.1 ms ± 0.2 ms [User: 12.1 ms, System: 2.8 ms] - Range (min … max): 12.7 ms … 13.8 ms 210 runs +Benchmark 1: ./test -s ./1MB.json + Time (mean ± σ): 8.6 ms ± 0.2 ms [User: 10.1 ms, System: 2.8 ms] + Range (min … max): 8.3 ms … 8.8 ms 10 runs + +Benchmark 2: ./test -s -libjson=false ./1MB.json + Time (mean ± σ): 15.1 ms ± 0.3 ms [User: 15.6 ms, System: 3.2 ms] + Range (min … max): 14.7 ms … 15.6 ms 10 runs + +Summary + ./test -s ./1MB.json ran + 1.76 ± 0.05 times faster than ./test -s -libjson=false ./1MB.json +Benchmark 1: ./test -s ./5MB.json + Time (mean ± σ): 33.6 ms ± 0.8 ms [User: 40.4 ms, System: 10.1 ms] + Range (min … max): 32.5 ms … 34.9 ms 10 runs + +Benchmark 2: ./test -s -libjson=false ./5MB.json + Time (mean ± σ): 66.2 ms ± 0.7 ms [User: 66.5 ms, System: 9.4 ms] + Range (min … max): 65.3 ms … 67.7 ms 10 runs + +Summary + ./test -s ./5MB.json ran + 1.97 ± 0.05 times faster than ./test -s -libjson=false ./5MB.json +Benchmark 1: ./test -s ./10MB.json + Time (mean ± σ): 64.3 ms ± 1.4 ms [User: 83.6 ms, System: 12.4 ms] + Range (min … max): 62.9 ms … 67.5 ms 10 runs -Benchmark 2: ./test -libjson=false ./1MB.json - Time (mean ± σ): 11.7 ms ± 0.3 ms [User: 9.5 ms, System: 2.1 ms] - Range (min … max): 11.1 ms … 12.7 ms 237 runs +Benchmark 2: ./test -s -libjson=false ./10MB.json + Time (mean ± σ): 132.4 ms ± 1.4 ms [User: 169.4 ms, System: 11.7 ms] + Range (min … max): 130.7 ms … 135.3 ms 10 runs Summary - ./test -libjson=false ./1MB.json ran - 1.12 ± 0.03 times faster than ./test ./1MB.json -Benchmark 1: ./test ./5MB.json - Time (mean ± σ): 64.2 ms ± 0.9 ms [User: 79.3 ms, System: 13.1 ms] - Range (min … max): 62.6 ms … 67.0 ms 46 runs + ./test -s ./10MB.json ran + 2.06 ± 0.05 times faster than ./test -s -libjson=false ./10MB.json +Benchmark 1: ./test -s ./100MB.json + Time (mean ± σ): 613.2 ms ± 2.9 ms [User: 803.8 ms, System: 65.6 ms] + Range (min … max): 609.0 ms … 618.7 ms 10 runs -Benchmark 2: ./test -libjson=false ./5MB.json - Time (mean ± σ): 55.2 ms ± 1.1 ms [User: 51.3 ms, System: 6.3 ms] - Range (min … max): 53.6 ms … 58.0 ms 53 runs +Benchmark 2: ./test -s -libjson=false ./100MB.json + Time (mean ± σ): 1.276 s ± 0.012 s [User: 1.522 s, System: 0.072 s] + Range (min … max): 1.262 s … 1.299 s 10 runs Summary - ./test -libjson=false ./5MB.json ran - 1.16 ± 0.03 times faster than ./test ./5MB.json + ./test -s ./100MB.json ran + 2.08 ± 0.02 times faster than ./test -s -libjson=false ./100MB.json ``` diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh new file mode 100755 index 0000000..0787939 --- /dev/null +++ b/benchmarks/bench.sh @@ -0,0 +1,15 @@ +#!/bin/bash +echo "generating example data" +python3 gen.py + +echo "building executable" +rm ./test +go build -o ./test ../cmd/lj.go + +for SIZE in 1MB 5MB 10MB 100MB; do + hyperfine \ + --warmup 1 \ + --runs 10 \ + "./test -s ./${SIZE}.json" \ + "./test -s -libjson=false ./${SIZE}.json" +done diff --git a/benchmarks/gen.py b/benchmarks/gen.py new file mode 100644 index 0000000..f169beb --- /dev/null +++ b/benchmarks/gen.py @@ -0,0 +1,47 @@ +from os.path import exists +import math +import json + +sizes =[1,5,10,100] + +line = json.dumps({ + "id": 12345, + "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", + "description": "This string contains\nmultiple\nlines\nand \"quotes\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"", + "nested": { + "level1": { + "level2": { + "level3": { + "level4": { + "array": [ + "short", + "string_with_escape\\n", + "another\\tvalue", + "unicode\u2603", + "escaped_quote_\"_and_backslash_\\", + 11234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,1234567890,234567890, + -1.2345e67, + 3.1415926535897932384626433832795028841971, + True, + False, + None, + "\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC\u0041\u0042\u0043\u00A9\u20AC", + "mix\\n\\t\\r\\\\\\\"end" + ] + } + } + } + } + } +}) + +def write_data(size: int): + name = f"{size}MB.json" + if not exists(name): + with open(name, mode="w", encoding="utf8") as f: + f.write("[\n") + size = math.floor((size*1000000)/len(line)) + f.write(",\n".join([line for _ in range(0, size)])) + f.write("\n]") + +[write_data(size) for size in sizes] diff --git a/cmd/lj.go b/cmd/lj.go index f87b538..c65f6fc 100644 --- a/cmd/lj.go +++ b/cmd/lj.go @@ -1,9 +1,14 @@ package main import ( + "encoding/json" + "flag" "fmt" "log" "os" + "path/filepath" + "runtime/debug" + "runtime/pprof" "github.com/xnacly/libjson" ) @@ -16,17 +21,69 @@ func Must[T any](t T, err error) T { } func main() { - args := os.Args + noGc := flag.Bool("nogc", false, "disable the go garbage collector") + useLibjson := flag.Bool("libjson", true, "use libjson, if false use encoding/json") + usePprof := flag.Bool("pprof", false, "use pprof cpu tracing") + query := flag.String("q", ".", "query the parsed json") + silent := flag.Bool("s", false, "no stdoutput") + escape := flag.Bool("e", false, "escapes input with Gos '%#+v'") + flag.Parse() + + if *noGc { + debug.SetGCPercent(-1) + } + + args := flag.Args() + + var filePath string var file *os.File if info, err := os.Stdin.Stat(); err != nil || info.Mode()&os.ModeCharDevice != 0 { // we are in a pipe - if len(args) == 1 { - log.Fatalln("Wanted a file as first argument, got nothing, exiting") + if len(args) == 0 { + log.Fatalln("Wanted a file as an argument, got nothing, exiting") } - file = Must(os.Open(args[1])) + filePath = args[0] + file = Must(os.Open(filePath)) } else { file = os.Stdin + filePath = "stdin" + } + + if *usePprof { + f, err := os.Create(filepath.Base(filePath) + ".pprof") + if err != nil { + panic(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + + if *useLibjson { + out := Must(libjson.NewReader(file)) + if !*silent { + out := Must(libjson.Get[any](&out, *query)) + if *escape { + fmt.Printf("%#+v\n", out) + } else { + fmt.Println(out) + } + } + } else { + if *query != "." { + panic("With -libjson=false, there is no support for querying the json") + } + + decoder := json.NewDecoder(file) + var out any + if err := decoder.Decode(&out); err != nil { + panic(err) + } + + if !*silent { + if *escape { + fmt.Printf("%#+v\n", out) + } else { + fmt.Println(out) + } + } } - query := os.Args[len(os.Args)-1] - json := Must(libjson.NewReader(file)) - fmt.Printf("%+#v\n", Must(libjson.Get[any](&json, query))) } diff --git a/go.mod b/go.mod index 3d85827..096bd1c 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/xnacly/libjson -go 1.23.0 +go 1.26.1 require github.com/stretchr/testify v1.9.0 diff --git a/hex.go b/hex.go new file mode 100644 index 0000000..8fa4a59 --- /dev/null +++ b/hex.go @@ -0,0 +1,54 @@ +package libjson + +import "errors" + +var invalid_hex_err = errors.New("invalid hex") + +var hexTable [256]byte + +func init() { + for i := 0; i < 256; i++ { + hexTable[i] = 0xFF + } + for i := byte('0'); i <= '9'; i++ { + hexTable[i] = i - '0' + } + for i := byte('a'); i <= 'f'; i++ { + hexTable[i] = i - 'a' + 10 + } + for i := byte('A'); i <= 'F'; i++ { + hexTable[i] = i - 'A' + 10 + } +} + +// hex4 converts 4 ASCII hex bytes to a rune. +// Returns an error if any byte is invalid. +func hex4(b []byte) (r rune, err error) { + var v byte + + v = hexTable[b[0]] + if v == 0xFF { + return 0, invalid_hex_err + } + r = rune(v) << 12 + + v = hexTable[b[1]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) << 8 + + v = hexTable[b[2]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) << 4 + + v = hexTable[b[3]] + if v == 0xFF { + return 0, invalid_hex_err + } + r |= rune(v) + + return r, nil +} diff --git a/json.go b/json.go index 8477a8c..c433ffb 100644 --- a/json.go +++ b/json.go @@ -9,7 +9,7 @@ func NewReader(r io.Reader) (JSON, error) { if err != nil { return JSON{}, err } - p := parser{l: lexer{data: data}} + p := parser{l: lexer{data: data, len: len(data)}} obj, err := p.parse(data) if err != nil { return JSON{}, err @@ -17,8 +17,9 @@ func NewReader(r io.Reader) (JSON, error) { return JSON{obj}, nil } +// data is consumed and possibly mutated, DO NOT REUSE func New(data []byte) (JSON, error) { - p := parser{l: lexer{data: data}} + p := parser{l: lexer{data: data, len: len(data)}} obj, err := p.parse(data) if err != nil { return JSON{}, err diff --git a/json_test.go b/json_test.go index f7af6c3..85ccdd2 100644 --- a/json_test.go +++ b/json_test.go @@ -9,31 +9,88 @@ import ( ) const amount = 50_000 +const naiveInput = `{"key1":"value","array":[],"obj":{},"atomArray":[11201,1e112,true,false,null,"str"]},` +const escapedInput = `{"text":"line1\nline2\nline3","quote":"\"hello\"","path":"C:\\\\Users\\\\name","unicode":"\u0041\u0042\u0043","mix":"abc\\ndef\"ghi\u263A"},` +const hardInput = `{ + "id": 12345, + "name": "very_long_string_with_escapes_and_unicode_abcdefghijklmnopqrstuvwxyz_0123456789", + "description": "This string contains\nmultiple\nlines\nand \"quotes\" and unicode \u2764\u2764\u2764", + "nested": { + "level1": { + "level2": { + "level3": { + "level4": { + "array": [ + "short", + "string_with_escape\\n", + "another\\tvalue", + "unicode\u2603", + "escaped_quote_\"_and_backslash_\\", + 1234567890, + -1.2345e67, + 3.141592653589793, + true, + false, + null, + "ABC\u00a9\u20ac", + "mix\\n\\t\\r\\\\\\\"end" + ] + } + } + } + } + } +},` -func BenchmarkLibJson(b *testing.B) { - data := strings.Repeat(`{"key1": "value","array": [],"obj": {},"atomArray": [11201,1e112,true,false,null,"str"]},`, amount) +func benchmarkWithInput(b *testing.B, input string) { + data := strings.Repeat(input, amount) d := []byte("[" + data[:len(data)-1] + "]") + b.ResetTimer() for i := 0; i < b.N; i++ { - _, err := New(d) + buf := make([]byte, len(d)) + copy(buf, d) + b.StartTimer() + _, err := New(buf) + b.StopTimer() assert.NoError(b, err) } b.ReportAllocs() } -func BenchmarkEncodingJson(b *testing.B) { - data := strings.Repeat(`{"key1": "value","array": [],"obj": {},"atomArray": [11201,1e112,true,false,null,"str"]},`, amount) +func benchmarkEncodingJsonWithInput(b *testing.B, input string) { + data := strings.Repeat(input, amount) d := []byte("[" + data[:len(data)-1] + "]") + b.ResetTimer() for i := 0; i < b.N; i++ { - v := []struct { - Key1 string - Array []any - Obj any - AtomArray []any - }{} + var v any err := json.Unmarshal(d, &v) assert.NoError(b, err) } b.ReportAllocs() } + +func BenchmarkLibJson_Naive(b *testing.B) { + benchmarkWithInput(b, naiveInput) +} + +func BenchmarkLibJson_Escaped(b *testing.B) { + benchmarkWithInput(b, escapedInput) +} + +func BenchmarkLibJson_Hard(b *testing.B) { + benchmarkWithInput(b, hardInput) +} + +func BenchmarkEncodingJson_Naive(b *testing.B) { + benchmarkEncodingJsonWithInput(b, naiveInput) +} + +func BenchmarkEncodingJson_Escaped(b *testing.B) { + benchmarkEncodingJsonWithInput(b, escapedInput) +} + +func BenchmarkEncodingJson_Hard(b *testing.B) { + benchmarkEncodingJsonWithInput(b, hardInput) +} diff --git a/lexer.go b/lexer.go index e3888d7..d49434b 100644 --- a/lexer.go +++ b/lexer.go @@ -9,6 +9,7 @@ import ( type lexer struct { data []byte pos int + len int } var numChar [256]bool @@ -25,7 +26,7 @@ func init() { } func (l *lexer) next() (token, error) { - for l.pos < len(l.data) { + for l.pos < l.len { cc := l.data[l.pos] if cc == ' ' || cc == '\n' || cc == '\t' || cc == '\r' { l.pos++ @@ -34,7 +35,7 @@ func (l *lexer) next() (token, error) { } } - if l.pos >= len(l.data) { + if l.pos >= l.len { return empty, nil } @@ -57,16 +58,33 @@ func (l *lexer) next() (token, error) { tt = t_colon case '"': start := l.pos - for i := start; i < len(l.data); i++ { - if l.data[i] == '"' { + for i := start; i < l.len; i++ { + if c := l.data[i]; c == '"' { t := token{Type: t_string, Start: start, End: i} l.pos = i + 1 return t, nil + } else if c == '\\' { // OH NO ITS ESCAPING :O + i++ + if i >= l.len { + return empty, errors.New("Unterminated string escape") + } + + switch l.data[i] { + case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': + // we simply skip the escaped char, the parser has to + case 'u': + if i+4 > l.len { + return empty, errors.New("Unterminated string") + } + i += 4 + default: + return empty, fmt.Errorf("Invalid escape %q", l.data[i]) + } } } return empty, errors.New("Unterminated string") case 't': // this should always be the 'true' atom and is therefore optimised here - if l.pos+3 > len(l.data) { + if l.pos+3 > l.len { return empty, errors.New("Failed to read the expected 'true' atom") } if !(l.data[l.pos] == 'r' && l.data[l.pos+1] == 'u' && l.data[l.pos+2] == 'e') { @@ -75,7 +93,7 @@ func (l *lexer) next() (token, error) { l.pos += 3 tt = t_true case 'f': // this should always be the 'false' atom and is therefore optimised here - if l.pos+4 > len(l.data) { + if l.pos+4 > l.len { return empty, errors.New("Failed to read the expected 'false' atom") } if !(l.data[l.pos] == 'a' && l.data[l.pos+1] == 'l' && l.data[l.pos+2] == 's' && l.data[l.pos+3] == 'e') { @@ -84,7 +102,7 @@ func (l *lexer) next() (token, error) { l.pos += 4 tt = t_false case 'n': // this should always be the 'null' atom and is therefore optimised here - if l.pos+3 > len(l.data) { + if l.pos+3 > l.len { return empty, errors.New("Failed to read the expected 'null' atom") } if !(l.data[l.pos] == 'u' && l.data[l.pos+1] == 'l' && l.data[l.pos+2] == 'l') { @@ -95,7 +113,7 @@ func (l *lexer) next() (token, error) { default: if cc == '-' || (cc >= '0' && cc <= '9') { start := l.pos - 1 - for l.pos < len(l.data) && numChar[l.data[l.pos]] { + for l.pos < l.len && numChar[l.data[l.pos]] { l.pos++ } @@ -115,6 +133,7 @@ func (l *lexer) lex(r io.Reader) ([]token, error) { if err != nil { return nil, err } + l.len = len(l.data) toks := make([]token, 0, len(l.data)/2) for { diff --git a/parser.go b/parser.go index 98faa54..a247c1d 100644 --- a/parser.go +++ b/parser.go @@ -1,7 +1,10 @@ package libjson import ( + "errors" "fmt" + "strconv" + "unicode/utf8" "unsafe" ) @@ -57,16 +60,16 @@ func (p *parser) object() (map[string]any, error) { return nil, err } - m := make(map[string]any, 4) - if p.cur_tok.Type == t_right_curly { err := p.advance() if err != nil { return nil, err } - return m, nil + return make(map[string]any, 0), nil } + m := make(map[string]any, 8) + for p.cur_tok.Type != t_eof && p.cur_tok.Type != t_right_curly { if len(m) > 0 { if p.cur_tok.Type != t_comma { @@ -161,17 +164,97 @@ func (p *parser) array() ([]any, error) { return a, p.advance() } +var badEscapeErr = errors.New("bad escape") + +// unescapes JSON escapes in a buffer into their non-JSON representation +// +// Returns the end of the in place escaped buffer so the caller can resize to +// the new, smaller buffer size +// +// The implementation may look weird, but is optimised to have the least +// possible branches +func unescapeInPlace(in []byte) (int, error) { + curEnd := 0 + for i := 0; i < len(in); i++ { + b := in[i] + if b != '\\' { + in[curEnd] = b + curEnd++ + continue + } + + // check if there’s at least 1 more byte for the escape + if i+1 >= len(in) { + return 0, badEscapeErr + } + i++ // skip \ + b = in[i] + + switch b { + case '"', '\\', '/': + in[curEnd] = b + curEnd++ + case 'b': + in[curEnd] = '\b' + curEnd++ + case 'f': + in[curEnd] = '\f' + curEnd++ + case 'n': + in[curEnd] = '\n' + curEnd++ + case 'r': + in[curEnd] = '\r' + curEnd++ + case 't': + in[curEnd] = '\t' + curEnd++ + case 'u': // \uXXXX + + // From ECMA-404: + // + // However, whether a processor of JSON texts interprets such a surrogate pair + // as a single code point or as an explicit surrogate pair is a semantic + // decision that is determined by the specific processor. + // + // meaning we dont merge unicode points, firstly because fuck + // utf16, and secondly because its simpler to just keep two unicode + // points separate compared to increasing the complexity of this + // decoding + + if i+4 >= len(in) { + return 0, badEscapeErr + } + + r, err := hex4(in[i+1 : i+5]) + if err != nil { + return 0, err + } + n := utf8.EncodeRune(in[curEnd:], r) + curEnd += n + i += 4 + } // we dont need a default case since we check all possible escapes in the lexer + } + + return curEnd, nil +} + func (p *parser) atom() (any, error) { var r any switch p.cur_tok.Type { case t_string: in := p.input[p.cur_tok.Start:p.cur_tok.End] + end, err := unescapeInPlace(in) + if err != nil { + return nil, err + } + in = in[:end] r = *(*string)(unsafe.Pointer(&in)) case t_number: raw := p.input[p.cur_tok.Start:p.cur_tok.End] - number, err := parseFloat(raw) + number, err := strconv.ParseFloat(*(*string)(unsafe.Pointer(&raw)), 64) if err != nil { - return empty, fmt.Errorf("Invalid floating point number %q: %w", string(raw), err) + return nil, fmt.Errorf("Invalid floating point number %q: %w", string(raw), err) } r = number case t_true: diff --git a/parser_test.go b/parser_test.go index 907473b..30731f5 100644 --- a/parser_test.go +++ b/parser_test.go @@ -30,7 +30,7 @@ func TestParserAtoms(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -54,7 +54,7 @@ func TestParserArray(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -82,7 +82,7 @@ func TestParserObject(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -110,7 +110,7 @@ func TestParserEdge(t *testing.T) { for i, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.NoError(t, err) assert.EqualValues(t, wanted[i], out) @@ -145,7 +145,7 @@ func TestParserFail(t *testing.T) { for _, in := range input { t.Run(in, func(t *testing.T) { in := []byte(in) - p := &parser{l: lexer{data: in}} + p := &parser{l: lexer{data: in, len: len(in)}} out, err := p.parse(in) assert.Error(t, err) assert.Nil(t, out) diff --git a/test/bench.sh b/test/bench.sh deleted file mode 100755 index 9585625..0000000 --- a/test/bench.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -echo "generating example data" -python3 gen.py - -echo "building executable" -rm ./test -go build ./test.go - -hyperfine "./test ./1MB.json" "./test -libjson=false ./1MB.json" -hyperfine "./test ./5MB.json" "./test -libjson=false ./5MB.json" -hyperfine "./test ./10MB.json" "./test -libjson=false ./10MB.json" diff --git a/test/gen.py b/test/gen.py deleted file mode 100644 index 50d2bcb..0000000 --- a/test/gen.py +++ /dev/null @@ -1,22 +0,0 @@ -from os.path import exists -import math - -sizes =[1,5,10] - -line = """\t{ - "key1": "value", - "array": [], - "obj": {}, - "atomArray": [11201,1e112,true,false,null,"str"] - }""" - -def write_data(size: int): - name = f"{size}MB.json" - if not exists(name): - with open(name, mode="w", encoding="utf8") as f: - f.write("[\n") - size = math.floor((size*1000000)/len(line)) - f.write(",\n".join([line for _ in range(0, size)])) - f.write("\n]") - -[write_data(size) for size in sizes] diff --git a/test/test.go b/test/test.go deleted file mode 100644 index 3a6dfe0..0000000 --- a/test/test.go +++ /dev/null @@ -1,49 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "log" - "os" - - // "runtime/pprof" - - "github.com/xnacly/libjson" -) - -func main() { - // f, err := os.Create("cpu.pprof") - // if err != nil { - // panic(err) - // } - // pprof.StartCPUProfile(f) - // defer pprof.StopCPUProfile() - lj := flag.Bool("libjson", true, "benchmark libjson or gojson") - flag.Parse() - args := flag.Args() - if len(args) == 0 { - log.Fatalln("Wanted a file as first argument, got nothing, exiting") - } - file, err := os.Open(args[0]) - if err != nil { - log.Fatalln(err) - } - if *lj { - _, err := libjson.NewReader(file) - if err != nil { - log.Fatalln(err) - } - } else { - v := []struct { - Key1 string - Array []any - Obj any - AtomArray []any - }{} - d := json.NewDecoder(file) - err := d.Decode(&v) - if err != nil { - log.Fatalln(err) - } - } -} diff --git a/types.go b/tokens.go similarity index 100% rename from types.go rename to tokens.go