From 338e4fcd72ebacdf87b0961cbde9c9ffa17222ea Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 7 Jan 2026 19:07:01 +0200 Subject: [PATCH 1/7] feat: add benchmark for decoding and serializing wide JSON objects --- arrow-json/Cargo.toml | 4 ++ arrow-json/benches/reader.rs | 119 +++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 arrow-json/benches/reader.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index f2653ec4e46e..28cfdb05f5b8 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -65,3 +65,7 @@ rand = { version = "0.9", default-features = false, features = ["std", "std_rng" [[bench]] name = "serde" harness = false + +[[bench]] +name = "reader" +harness = false diff --git a/arrow-json/benches/reader.rs b/arrow-json/benches/reader.rs new file mode 100644 index 000000000000..0fcd260c5932 --- /dev/null +++ b/arrow-json/benches/reader.rs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_json::ReaderBuilder; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{Criterion, criterion_group, criterion_main}; +use serde_json::{Map, Number, Value}; +use std::fmt::Write; +use std::sync::Arc; + +fn build_schema(field_count: usize) -> Arc { + // Builds a schema with fields named f0..f{field_count-1}, all Int64 and non-nullable. + let fields: Vec = (0..field_count) + .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) + .collect(); + Arc::new(Schema::new(fields)) +} + +fn build_wide_json(rows: usize, fields: usize) -> Vec { + // Builds newline-delimited JSON objects with "wide" schema. + // Example (rows=2, fields=3): + // {"f0":0,"f1":1,"f2":2} + // {"f0":1,"f1":2,"f2":3} + let mut out = String::with_capacity(rows * fields * 12); + for row in 0..rows { + out.push('{'); + for field in 0..fields { + if field > 0 { + out.push(','); + } + let value = row as i64 + field as i64; + write!(&mut out, "\"f{field}\":{value}").unwrap(); + } + out.push('}'); + out.push('\n'); + } + out.into_bytes() +} + +fn build_wide_values(rows: usize, fields: usize) -> Vec { + // Mirrors build_wide_json but returns structured serde_json::Value objects. + let mut out = Vec::with_capacity(rows); + for row in 0..rows { + let mut map = Map::with_capacity(fields); + for field in 0..fields { + let key = format!("f{field}"); + let value = Number::from((row + field) as i64); + map.insert(key, Value::Number(value)); + } + out.push(Value::Object(map)); + } + out +} + +fn bench_decode_wide_object(c: &mut Criterion) { + let rows = 4096; + let fields = 64; + let data = build_wide_json(rows, fields); + let schema = build_schema(fields); + + c.bench_function("decode_wide_object_i64_json", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(1024) + .build_decoder() + .unwrap(); + + let mut offset = 0; + while offset < data.len() { + let read = decoder.decode(&data[offset..]).unwrap(); + if read == 0 { + break; + } + offset += read; + while let Some(_batch) = decoder.flush().unwrap() {} + } + }) + }); +} + +fn bench_serialize_wide_object(c: &mut Criterion) { + let rows = 4096; + let fields = 64; + let values = build_wide_values(rows, fields); + let schema = build_schema(fields); + + c.bench_function("decode_wide_object_i64_serialize", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(1024) + .build_decoder() + .unwrap(); + + decoder.serialize(&values).unwrap(); + while let Some(_batch) = decoder.flush().unwrap() {} + }) + }); +} + +criterion_group!( + benches, + bench_decode_wide_object, + bench_serialize_wide_object +); +criterion_main!(benches); From 49862838599b173b63cad127127d1952ae5e7c1a Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 7 Jan 2026 19:34:49 +0200 Subject: [PATCH 2/7] feat: add benchmarks for decoding and serializing wide JSON objects --- arrow-json/Cargo.toml | 6 +- arrow-json/benches/binary_hex.rs | 107 ++++++++++++++++++ .../benches/{reader.rs => wide_object.rs} | 0 3 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 arrow-json/benches/binary_hex.rs rename arrow-json/benches/{reader.rs => wide_object.rs} (100%) diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 28cfdb05f5b8..4d7b6b08aab2 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -67,5 +67,9 @@ name = "serde" harness = false [[bench]] -name = "reader" +name = "wide_object" +harness = false + +[[bench]] +name = "binary_hex" harness = false diff --git a/arrow-json/benches/binary_hex.rs b/arrow-json/benches/binary_hex.rs new file mode 100644 index 000000000000..993382d29462 --- /dev/null +++ b/arrow-json/benches/binary_hex.rs @@ -0,0 +1,107 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_json::ReaderBuilder; +use arrow_schema::{DataType, Field}; +use criterion::{Criterion, criterion_group, criterion_main}; +use std::hint::black_box; +use std::sync::Arc; + +const BINARY_ROWS: usize = 1 << 15; +const BINARY_BYTES: usize = 64; + +fn bench_decode(c: &mut Criterion, name: &str, data: &[u8], field: Arc, rows: usize) { + c.bench_function(name, |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new_with_field(field.clone()) + .with_batch_size(rows) + .build_decoder() + .unwrap(); + + let mut offset = 0; + while offset < data.len() { + let read = decoder.decode(black_box(&data[offset..])).unwrap(); + if read == 0 { + break; + } + offset += read; + } + + let batch = decoder.flush().unwrap(); + black_box(batch); + }) + }); +} + +#[inline] +fn append_hex_byte(buf: &mut String, byte: u8) { + const HEX: &[u8; 16] = b"0123456789abcdef"; + buf.push(HEX[(byte >> 4) as usize] as char); + buf.push(HEX[(byte & 0x0f) as usize] as char); +} + +fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { + let mut data = String::with_capacity(rows * (bytes_per_row * 2 + 3)); + for row in 0..rows { + data.push('"'); + for i in 0..bytes_per_row { + let byte = ((row + i) & 0xff) as u8; + append_hex_byte(&mut data, byte); + } + data.push('"'); + data.push('\n'); + } + data.into_bytes() +} + +fn criterion_benchmark(c: &mut Criterion) { + let binary_data = build_hex_lines(BINARY_ROWS, BINARY_BYTES); + + let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); + bench_decode( + c, + "decode_binary_hex_json", + &binary_data, + binary_field, + BINARY_ROWS, + ); + + let fixed_field = Arc::new(Field::new( + "item", + DataType::FixedSizeBinary(BINARY_BYTES as i32), + false, + )); + bench_decode( + c, + "decode_fixed_binary_hex_json", + &binary_data, + fixed_field, + BINARY_ROWS, + ); + + let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); + bench_decode( + c, + "decode_binary_view_hex_json", + &binary_data, + view_field, + BINARY_ROWS, + ); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-json/benches/reader.rs b/arrow-json/benches/wide_object.rs similarity index 100% rename from arrow-json/benches/reader.rs rename to arrow-json/benches/wide_object.rs From 886bc40457dd9da1b29e9749bef9b552b1666cf0 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 7 Jan 2026 19:36:30 +0200 Subject: [PATCH 3/7] feat: add benchmark for wide JSON projection decoding --- arrow-json/Cargo.toml | 4 + arrow-json/benches/wide_projection.rs | 125 ++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 arrow-json/benches/wide_projection.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index 4d7b6b08aab2..a8e58a172030 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -73,3 +73,7 @@ harness = false [[bench]] name = "binary_hex" harness = false + +[[bench]] +name = "wide_projection" +harness = false diff --git a/arrow-json/benches/wide_projection.rs b/arrow-json/benches/wide_projection.rs new file mode 100644 index 000000000000..e484e473eb2f --- /dev/null +++ b/arrow-json/benches/wide_projection.rs @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_json::ReaderBuilder; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use std::fmt::Write; +use std::hint::black_box; +use std::sync::Arc; + +// Projection benchmark constants +const WIDE_PROJECTION_ROWS: usize = 1 << 14; // 16K rows +const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100; // 100 fields total, select only 3 + +fn bench_decode_schema( + c: &mut Criterion, + name: &str, + data: &[u8], + schema: Arc, + rows: usize, + projection: bool, +) { + let mut group = c.benchmark_group(name); + group.throughput(Throughput::Bytes(data.len() as u64)); + group.sample_size(50); + group.measurement_time(std::time::Duration::from_secs(5)); + group.warm_up_time(std::time::Duration::from_secs(2)); + group.sampling_mode(SamplingMode::Flat); + group.bench_function(BenchmarkId::from_parameter(rows), |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(rows) + .build_decoder() + .unwrap(); + + let mut offset = 0; + while offset < data.len() { + let read = decoder.decode(black_box(&data[offset..])).unwrap(); + if read == 0 { + break; + } + offset += read; + } + + let batch = decoder.flush().unwrap(); + black_box(batch); + }) + }); + group.finish(); +} + +fn build_wide_projection_json(rows: usize, total_fields: usize) -> Vec { + // Estimate: each field ~15 bytes ("fXX":VVVVVVV,), total ~15*100 + overhead + let per_row_size = total_fields * 15 + 10; + let mut data = String::with_capacity(rows * per_row_size); + + for _row in 0..rows { + data.push('{'); + for i in 0..total_fields { + if i > 0 { + data.push(','); + } + // Use fixed-width values for stable benchmarks: 7 digits + let _ = write!(data, "\"f{}\":{:07}", i, i); + } + data.push('}'); + data.push('\n'); + } + data.into_bytes() +} + +fn criterion_benchmark(c: &mut Criterion) { + // Wide projection workload: tests overhead of parsing unused fields + let wide_projection_data = + build_wide_projection_json(WIDE_PROJECTION_ROWS, WIDE_PROJECTION_TOTAL_FIELDS); + + // Full schema: all 100 fields + let mut full_fields = Vec::new(); + for i in 0..WIDE_PROJECTION_TOTAL_FIELDS { + full_fields.push(Field::new(format!("f{}", i), DataType::Int64, false)); + } + let full_schema = Arc::new(Schema::new(full_fields)); + bench_decode_schema( + c, + "decode_wide_projection_full_json", + &wide_projection_data, + full_schema, + WIDE_PROJECTION_ROWS, + false, + ); + + // Projected schema: only 3 fields (f0, f10, f50) out of 100 + let projected_schema = Arc::new(Schema::new(vec![ + Field::new("f0", DataType::Int64, false), + Field::new("f10", DataType::Int64, false), + Field::new("f50", DataType::Int64, false), + ])); + bench_decode_schema( + c, + "decode_wide_projection_narrow_json", + &wide_projection_data, + projected_schema, + WIDE_PROJECTION_ROWS, + true, + ); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 9df6f2525cc5808b013f6eb6b1e7757ce36e01c0 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 7 Jan 2026 19:42:32 +0200 Subject: [PATCH 4/7] feat: update benchmarks for wide JSON decoding and projection with increased row and batch sizes --- arrow-json/benches/binary_hex.rs | 31 ++++++--------------------- arrow-json/benches/wide_object.rs | 11 ++++++---- arrow-json/benches/wide_projection.rs | 6 ++++-- 3 files changed, 18 insertions(+), 30 deletions(-) diff --git a/arrow-json/benches/binary_hex.rs b/arrow-json/benches/binary_hex.rs index 993382d29462..7162126570d4 100644 --- a/arrow-json/benches/binary_hex.rs +++ b/arrow-json/benches/binary_hex.rs @@ -21,14 +21,15 @@ use criterion::{Criterion, criterion_group, criterion_main}; use std::hint::black_box; use std::sync::Arc; -const BINARY_ROWS: usize = 1 << 15; +const BINARY_ROWS: usize = 1 << 17; const BINARY_BYTES: usize = 64; +const BINARY_BATCH_SIZE: usize = 1 << 13; -fn bench_decode(c: &mut Criterion, name: &str, data: &[u8], field: Arc, rows: usize) { +fn bench_decode(c: &mut Criterion, name: &str, data: &[u8], field: Arc) { c.bench_function(name, |b| { b.iter(|| { let mut decoder = ReaderBuilder::new_with_field(field.clone()) - .with_batch_size(rows) + .with_batch_size(BINARY_BATCH_SIZE) .build_decoder() .unwrap(); @@ -72,35 +73,17 @@ fn criterion_benchmark(c: &mut Criterion) { let binary_data = build_hex_lines(BINARY_ROWS, BINARY_BYTES); let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); - bench_decode( - c, - "decode_binary_hex_json", - &binary_data, - binary_field, - BINARY_ROWS, - ); + bench_decode(c, "decode_binary_hex_json", &binary_data, binary_field); let fixed_field = Arc::new(Field::new( "item", DataType::FixedSizeBinary(BINARY_BYTES as i32), false, )); - bench_decode( - c, - "decode_fixed_binary_hex_json", - &binary_data, - fixed_field, - BINARY_ROWS, - ); + bench_decode(c, "decode_fixed_binary_hex_json", &binary_data, fixed_field); let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); - bench_decode( - c, - "decode_binary_view_hex_json", - &binary_data, - view_field, - BINARY_ROWS, - ); + bench_decode(c, "decode_binary_view_hex_json", &binary_data, view_field); } criterion_group!(benches, criterion_benchmark); diff --git a/arrow-json/benches/wide_object.rs b/arrow-json/benches/wide_object.rs index 0fcd260c5932..3d8f4c651149 100644 --- a/arrow-json/benches/wide_object.rs +++ b/arrow-json/benches/wide_object.rs @@ -22,6 +22,9 @@ use serde_json::{Map, Number, Value}; use std::fmt::Write; use std::sync::Arc; +const WIDE_ROWS: usize = 1 << 17; // 128K rows +const WIDE_BATCH_SIZE: usize = 1 << 13; // 8K rows per batch + fn build_schema(field_count: usize) -> Arc { // Builds a schema with fields named f0..f{field_count-1}, all Int64 and non-nullable. let fields: Vec = (0..field_count) @@ -67,7 +70,7 @@ fn build_wide_values(rows: usize, fields: usize) -> Vec { } fn bench_decode_wide_object(c: &mut Criterion) { - let rows = 4096; + let rows = WIDE_ROWS; let fields = 64; let data = build_wide_json(rows, fields); let schema = build_schema(fields); @@ -75,7 +78,7 @@ fn bench_decode_wide_object(c: &mut Criterion) { c.bench_function("decode_wide_object_i64_json", |b| { b.iter(|| { let mut decoder = ReaderBuilder::new(schema.clone()) - .with_batch_size(1024) + .with_batch_size(WIDE_BATCH_SIZE) .build_decoder() .unwrap(); @@ -93,7 +96,7 @@ fn bench_decode_wide_object(c: &mut Criterion) { } fn bench_serialize_wide_object(c: &mut Criterion) { - let rows = 4096; + let rows = WIDE_ROWS; let fields = 64; let values = build_wide_values(rows, fields); let schema = build_schema(fields); @@ -101,7 +104,7 @@ fn bench_serialize_wide_object(c: &mut Criterion) { c.bench_function("decode_wide_object_i64_serialize", |b| { b.iter(|| { let mut decoder = ReaderBuilder::new(schema.clone()) - .with_batch_size(1024) + .with_batch_size(WIDE_BATCH_SIZE) .build_decoder() .unwrap(); diff --git a/arrow-json/benches/wide_projection.rs b/arrow-json/benches/wide_projection.rs index e484e473eb2f..efb3b163aeba 100644 --- a/arrow-json/benches/wide_projection.rs +++ b/arrow-json/benches/wide_projection.rs @@ -25,8 +25,9 @@ use std::hint::black_box; use std::sync::Arc; // Projection benchmark constants -const WIDE_PROJECTION_ROWS: usize = 1 << 14; // 16K rows +const WIDE_PROJECTION_ROWS: usize = 1 << 17; // 128K rows const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100; // 100 fields total, select only 3 +const WIDE_PROJECTION_BATCH_SIZE: usize = 1 << 13; // 8K rows per batch fn bench_decode_schema( c: &mut Criterion, @@ -45,7 +46,8 @@ fn bench_decode_schema( group.bench_function(BenchmarkId::from_parameter(rows), |b| { b.iter(|| { let mut decoder = ReaderBuilder::new(schema.clone()) - .with_batch_size(rows) + .with_batch_size(WIDE_PROJECTION_BATCH_SIZE) + .with_projection(projection) .build_decoder() .unwrap(); From ac1f197ecda3a49d1d1c9ce64f245cee5335d82c Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 7 Jan 2026 19:54:04 +0200 Subject: [PATCH 5/7] chore --- arrow-json/benches/wide_projection.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arrow-json/benches/wide_projection.rs b/arrow-json/benches/wide_projection.rs index efb3b163aeba..a575585e28aa 100644 --- a/arrow-json/benches/wide_projection.rs +++ b/arrow-json/benches/wide_projection.rs @@ -35,7 +35,6 @@ fn bench_decode_schema( data: &[u8], schema: Arc, rows: usize, - projection: bool, ) { let mut group = c.benchmark_group(name); group.throughput(Throughput::Bytes(data.len() as u64)); @@ -47,7 +46,6 @@ fn bench_decode_schema( b.iter(|| { let mut decoder = ReaderBuilder::new(schema.clone()) .with_batch_size(WIDE_PROJECTION_BATCH_SIZE) - .with_projection(projection) .build_decoder() .unwrap(); @@ -104,7 +102,6 @@ fn criterion_benchmark(c: &mut Criterion) { &wide_projection_data, full_schema, WIDE_PROJECTION_ROWS, - false, ); // Projected schema: only 3 fields (f0, f10, f50) out of 100 @@ -119,7 +116,6 @@ fn criterion_benchmark(c: &mut Criterion) { &wide_projection_data, projected_schema, WIDE_PROJECTION_ROWS, - true, ); } From f61a9a08c08cfa3987f814f2ec6bbcf600755d62 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 7 Jan 2026 22:29:16 +0200 Subject: [PATCH 6/7] feat: refactor benchmarks by removing obsolete wide_object and wide_projection files and adding json-reader benchmark --- arrow-json/Cargo.toml | 10 +- arrow-json/benches/binary_hex.rs | 90 --------- arrow-json/benches/json-reader.rs | 256 ++++++++++++++++++++++++++ arrow-json/benches/wide_object.rs | 122 ------------ arrow-json/benches/wide_projection.rs | 123 ------------- 5 files changed, 257 insertions(+), 344 deletions(-) delete mode 100644 arrow-json/benches/binary_hex.rs create mode 100644 arrow-json/benches/json-reader.rs delete mode 100644 arrow-json/benches/wide_object.rs delete mode 100644 arrow-json/benches/wide_projection.rs diff --git a/arrow-json/Cargo.toml b/arrow-json/Cargo.toml index a8e58a172030..5fcde480eb6d 100644 --- a/arrow-json/Cargo.toml +++ b/arrow-json/Cargo.toml @@ -67,13 +67,5 @@ name = "serde" harness = false [[bench]] -name = "wide_object" -harness = false - -[[bench]] -name = "binary_hex" -harness = false - -[[bench]] -name = "wide_projection" +name = "json-reader" harness = false diff --git a/arrow-json/benches/binary_hex.rs b/arrow-json/benches/binary_hex.rs deleted file mode 100644 index 7162126570d4..000000000000 --- a/arrow-json/benches/binary_hex.rs +++ /dev/null @@ -1,90 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_json::ReaderBuilder; -use arrow_schema::{DataType, Field}; -use criterion::{Criterion, criterion_group, criterion_main}; -use std::hint::black_box; -use std::sync::Arc; - -const BINARY_ROWS: usize = 1 << 17; -const BINARY_BYTES: usize = 64; -const BINARY_BATCH_SIZE: usize = 1 << 13; - -fn bench_decode(c: &mut Criterion, name: &str, data: &[u8], field: Arc) { - c.bench_function(name, |b| { - b.iter(|| { - let mut decoder = ReaderBuilder::new_with_field(field.clone()) - .with_batch_size(BINARY_BATCH_SIZE) - .build_decoder() - .unwrap(); - - let mut offset = 0; - while offset < data.len() { - let read = decoder.decode(black_box(&data[offset..])).unwrap(); - if read == 0 { - break; - } - offset += read; - } - - let batch = decoder.flush().unwrap(); - black_box(batch); - }) - }); -} - -#[inline] -fn append_hex_byte(buf: &mut String, byte: u8) { - const HEX: &[u8; 16] = b"0123456789abcdef"; - buf.push(HEX[(byte >> 4) as usize] as char); - buf.push(HEX[(byte & 0x0f) as usize] as char); -} - -fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { - let mut data = String::with_capacity(rows * (bytes_per_row * 2 + 3)); - for row in 0..rows { - data.push('"'); - for i in 0..bytes_per_row { - let byte = ((row + i) & 0xff) as u8; - append_hex_byte(&mut data, byte); - } - data.push('"'); - data.push('\n'); - } - data.into_bytes() -} - -fn criterion_benchmark(c: &mut Criterion) { - let binary_data = build_hex_lines(BINARY_ROWS, BINARY_BYTES); - - let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); - bench_decode(c, "decode_binary_hex_json", &binary_data, binary_field); - - let fixed_field = Arc::new(Field::new( - "item", - DataType::FixedSizeBinary(BINARY_BYTES as i32), - false, - )); - bench_decode(c, "decode_fixed_binary_hex_json", &binary_data, fixed_field); - - let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); - bench_decode(c, "decode_binary_view_hex_json", &binary_data, view_field); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/arrow-json/benches/json-reader.rs b/arrow-json/benches/json-reader.rs new file mode 100644 index 000000000000..743d434765cc --- /dev/null +++ b/arrow-json/benches/json-reader.rs @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_json::reader::Decoder; +use arrow_json::ReaderBuilder; +use arrow_schema::{DataType, Field, Schema}; +use criterion::{ + BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, +}; +use serde_json::{Map, Number, Value}; +use std::fmt::Write; +use std::hint::black_box; +use std::sync::Arc; + +const ROWS: usize = 1 << 17; // 128K rows +const BATCH_SIZE: usize = 1 << 13; // 8K rows per batch + +const WIDE_FIELDS: usize = 64; +const BINARY_BYTES: usize = 64; +const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100; // 100 fields total, select only 3 + +fn decode_and_flush(decoder: &mut Decoder, data: &[u8]) { + let mut offset = 0; + while offset < data.len() { + let read = decoder.decode(black_box(&data[offset..])).unwrap(); + if read == 0 { + break; + } + offset += read; + while let Some(_batch) = decoder.flush().unwrap() {} + } +} + +fn build_schema(field_count: usize) -> Arc { + // Builds a schema with fields named f0..f{field_count-1}, all Int64 and non-nullable. + let fields: Vec = (0..field_count) + .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) + .collect(); + Arc::new(Schema::new(fields)) +} + +fn build_projection_schema(indices: &[usize]) -> Arc { + let fields: Vec = indices + .iter() + .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) + .collect(); + Arc::new(Schema::new(fields)) +} + +fn build_wide_json(rows: usize, fields: usize) -> Vec { + // Builds newline-delimited JSON objects with "wide" schema. + // Example (rows=2, fields=3): + // {"f0":0,"f1":1,"f2":2} + // {"f0":1,"f1":2,"f2":3} + let mut out = String::with_capacity(rows * fields * 12); + for row in 0..rows { + out.push('{'); + for field in 0..fields { + if field > 0 { + out.push(','); + } + let value = row as i64 + field as i64; + write!(&mut out, "\"f{field}\":{value}").unwrap(); + } + out.push('}'); + out.push('\n'); + } + out.into_bytes() +} + +fn build_wide_values(rows: usize, fields: usize) -> Vec { + // Mirrors build_wide_json but returns structured serde_json::Value objects. + let mut out = Vec::with_capacity(rows); + for row in 0..rows { + let mut map = Map::with_capacity(fields); + for field in 0..fields { + let key = format!("f{field}"); + let value = Number::from((row + field) as i64); + map.insert(key, Value::Number(value)); + } + out.push(Value::Object(map)); + } + out +} + +fn bench_decode_wide_object(c: &mut Criterion) { + let data = build_wide_json(ROWS, WIDE_FIELDS); + let schema = build_schema(WIDE_FIELDS); + + c.bench_function("decode_wide_object_i64_json", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, &data); + }) + }); +} + +fn bench_serialize_wide_object(c: &mut Criterion) { + let values = build_wide_values(ROWS, WIDE_FIELDS); + let schema = build_schema(WIDE_FIELDS); + + c.bench_function("decode_wide_object_i64_serialize", |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + + decoder.serialize(&values).unwrap(); + while let Some(_batch) = decoder.flush().unwrap() {} + }) + }); +} + +fn bench_decode_binary(c: &mut Criterion, name: &str, data: &[u8], field: Arc) { + c.bench_function(name, |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new_with_field(field.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, data); + }) + }); +} + +#[inline] +fn append_hex_byte(buf: &mut String, byte: u8) { + const HEX: &[u8; 16] = b"0123456789abcdef"; + buf.push(HEX[(byte >> 4) as usize] as char); + buf.push(HEX[(byte & 0x0f) as usize] as char); +} + +fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { + let mut data = String::with_capacity(rows * (bytes_per_row * 2 + 3)); + for row in 0..rows { + data.push('"'); + for i in 0..bytes_per_row { + let byte = ((row + i) & 0xff) as u8; + append_hex_byte(&mut data, byte); + } + data.push('"'); + data.push('\n'); + } + data.into_bytes() +} + +fn bench_binary_hex(c: &mut Criterion) { + let binary_data = build_hex_lines(ROWS, BINARY_BYTES); + + let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); + bench_decode_binary(c, "decode_binary_hex_json", &binary_data, binary_field); + + let fixed_field = Arc::new(Field::new( + "item", + DataType::FixedSizeBinary(BINARY_BYTES as i32), + false, + )); + bench_decode_binary(c, "decode_fixed_binary_hex_json", &binary_data, fixed_field); + + let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); + bench_decode_binary(c, "decode_binary_view_hex_json", &binary_data, view_field); +} + +fn bench_decode_schema( + c: &mut Criterion, + name: &str, + data: &[u8], + schema: Arc, +) { + let mut group = c.benchmark_group(name); + group.throughput(Throughput::Bytes(data.len() as u64)); + group.sample_size(50); + group.measurement_time(std::time::Duration::from_secs(5)); + group.warm_up_time(std::time::Duration::from_secs(2)); + group.sampling_mode(SamplingMode::Flat); + group.bench_function(BenchmarkId::from_parameter(ROWS), |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new(schema.clone()) + .with_batch_size(BATCH_SIZE) + .build_decoder() + .unwrap(); + decode_and_flush(&mut decoder, data); + }) + }); + group.finish(); +} + +fn build_wide_projection_json(rows: usize, total_fields: usize) -> Vec { + // Estimate: each field ~15 bytes ("fXX":VVVVVVV,), total ~15*100 + overhead + let per_row_size = total_fields * 15 + 10; + let mut data = String::with_capacity(rows * per_row_size); + + for _row in 0..rows { + data.push('{'); + for i in 0..total_fields { + if i > 0 { + data.push(','); + } + // Use fixed-width values for stable benchmarks: 7 digits + let _ = write!(data, "\"f{}\":{:07}", i, i); + } + data.push('}'); + data.push('\n'); + } + data.into_bytes() +} + +fn bench_wide_projection(c: &mut Criterion) { + // Wide projection workload: tests overhead of parsing unused fields + let wide_projection_data = + build_wide_projection_json(ROWS, WIDE_PROJECTION_TOTAL_FIELDS); + + let full_schema = build_schema(WIDE_PROJECTION_TOTAL_FIELDS); + bench_decode_schema( + c, + "decode_wide_projection_full_json", + &wide_projection_data, + full_schema, + ); + + // Projected schema: only 3 fields (f0, f10, f50) out of 100 + let projected_schema = build_projection_schema(&[0, 10, 50]); + bench_decode_schema( + c, + "decode_wide_projection_narrow_json", + &wide_projection_data, + projected_schema, + ); +} + +criterion_group!( + benches, + bench_decode_wide_object, + bench_serialize_wide_object, + bench_binary_hex, + bench_wide_projection +); +criterion_main!(benches); diff --git a/arrow-json/benches/wide_object.rs b/arrow-json/benches/wide_object.rs deleted file mode 100644 index 3d8f4c651149..000000000000 --- a/arrow-json/benches/wide_object.rs +++ /dev/null @@ -1,122 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_json::ReaderBuilder; -use arrow_schema::{DataType, Field, Schema}; -use criterion::{Criterion, criterion_group, criterion_main}; -use serde_json::{Map, Number, Value}; -use std::fmt::Write; -use std::sync::Arc; - -const WIDE_ROWS: usize = 1 << 17; // 128K rows -const WIDE_BATCH_SIZE: usize = 1 << 13; // 8K rows per batch - -fn build_schema(field_count: usize) -> Arc { - // Builds a schema with fields named f0..f{field_count-1}, all Int64 and non-nullable. - let fields: Vec = (0..field_count) - .map(|i| Field::new(format!("f{i}"), DataType::Int64, false)) - .collect(); - Arc::new(Schema::new(fields)) -} - -fn build_wide_json(rows: usize, fields: usize) -> Vec { - // Builds newline-delimited JSON objects with "wide" schema. - // Example (rows=2, fields=3): - // {"f0":0,"f1":1,"f2":2} - // {"f0":1,"f1":2,"f2":3} - let mut out = String::with_capacity(rows * fields * 12); - for row in 0..rows { - out.push('{'); - for field in 0..fields { - if field > 0 { - out.push(','); - } - let value = row as i64 + field as i64; - write!(&mut out, "\"f{field}\":{value}").unwrap(); - } - out.push('}'); - out.push('\n'); - } - out.into_bytes() -} - -fn build_wide_values(rows: usize, fields: usize) -> Vec { - // Mirrors build_wide_json but returns structured serde_json::Value objects. - let mut out = Vec::with_capacity(rows); - for row in 0..rows { - let mut map = Map::with_capacity(fields); - for field in 0..fields { - let key = format!("f{field}"); - let value = Number::from((row + field) as i64); - map.insert(key, Value::Number(value)); - } - out.push(Value::Object(map)); - } - out -} - -fn bench_decode_wide_object(c: &mut Criterion) { - let rows = WIDE_ROWS; - let fields = 64; - let data = build_wide_json(rows, fields); - let schema = build_schema(fields); - - c.bench_function("decode_wide_object_i64_json", |b| { - b.iter(|| { - let mut decoder = ReaderBuilder::new(schema.clone()) - .with_batch_size(WIDE_BATCH_SIZE) - .build_decoder() - .unwrap(); - - let mut offset = 0; - while offset < data.len() { - let read = decoder.decode(&data[offset..]).unwrap(); - if read == 0 { - break; - } - offset += read; - while let Some(_batch) = decoder.flush().unwrap() {} - } - }) - }); -} - -fn bench_serialize_wide_object(c: &mut Criterion) { - let rows = WIDE_ROWS; - let fields = 64; - let values = build_wide_values(rows, fields); - let schema = build_schema(fields); - - c.bench_function("decode_wide_object_i64_serialize", |b| { - b.iter(|| { - let mut decoder = ReaderBuilder::new(schema.clone()) - .with_batch_size(WIDE_BATCH_SIZE) - .build_decoder() - .unwrap(); - - decoder.serialize(&values).unwrap(); - while let Some(_batch) = decoder.flush().unwrap() {} - }) - }); -} - -criterion_group!( - benches, - bench_decode_wide_object, - bench_serialize_wide_object -); -criterion_main!(benches); diff --git a/arrow-json/benches/wide_projection.rs b/arrow-json/benches/wide_projection.rs deleted file mode 100644 index a575585e28aa..000000000000 --- a/arrow-json/benches/wide_projection.rs +++ /dev/null @@ -1,123 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_json::ReaderBuilder; -use arrow_schema::{DataType, Field, Schema}; -use criterion::{ - BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, -}; -use std::fmt::Write; -use std::hint::black_box; -use std::sync::Arc; - -// Projection benchmark constants -const WIDE_PROJECTION_ROWS: usize = 1 << 17; // 128K rows -const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100; // 100 fields total, select only 3 -const WIDE_PROJECTION_BATCH_SIZE: usize = 1 << 13; // 8K rows per batch - -fn bench_decode_schema( - c: &mut Criterion, - name: &str, - data: &[u8], - schema: Arc, - rows: usize, -) { - let mut group = c.benchmark_group(name); - group.throughput(Throughput::Bytes(data.len() as u64)); - group.sample_size(50); - group.measurement_time(std::time::Duration::from_secs(5)); - group.warm_up_time(std::time::Duration::from_secs(2)); - group.sampling_mode(SamplingMode::Flat); - group.bench_function(BenchmarkId::from_parameter(rows), |b| { - b.iter(|| { - let mut decoder = ReaderBuilder::new(schema.clone()) - .with_batch_size(WIDE_PROJECTION_BATCH_SIZE) - .build_decoder() - .unwrap(); - - let mut offset = 0; - while offset < data.len() { - let read = decoder.decode(black_box(&data[offset..])).unwrap(); - if read == 0 { - break; - } - offset += read; - } - - let batch = decoder.flush().unwrap(); - black_box(batch); - }) - }); - group.finish(); -} - -fn build_wide_projection_json(rows: usize, total_fields: usize) -> Vec { - // Estimate: each field ~15 bytes ("fXX":VVVVVVV,), total ~15*100 + overhead - let per_row_size = total_fields * 15 + 10; - let mut data = String::with_capacity(rows * per_row_size); - - for _row in 0..rows { - data.push('{'); - for i in 0..total_fields { - if i > 0 { - data.push(','); - } - // Use fixed-width values for stable benchmarks: 7 digits - let _ = write!(data, "\"f{}\":{:07}", i, i); - } - data.push('}'); - data.push('\n'); - } - data.into_bytes() -} - -fn criterion_benchmark(c: &mut Criterion) { - // Wide projection workload: tests overhead of parsing unused fields - let wide_projection_data = - build_wide_projection_json(WIDE_PROJECTION_ROWS, WIDE_PROJECTION_TOTAL_FIELDS); - - // Full schema: all 100 fields - let mut full_fields = Vec::new(); - for i in 0..WIDE_PROJECTION_TOTAL_FIELDS { - full_fields.push(Field::new(format!("f{}", i), DataType::Int64, false)); - } - let full_schema = Arc::new(Schema::new(full_fields)); - bench_decode_schema( - c, - "decode_wide_projection_full_json", - &wide_projection_data, - full_schema, - WIDE_PROJECTION_ROWS, - ); - - // Projected schema: only 3 fields (f0, f10, f50) out of 100 - let projected_schema = Arc::new(Schema::new(vec![ - Field::new("f0", DataType::Int64, false), - Field::new("f10", DataType::Int64, false), - Field::new("f50", DataType::Int64, false), - ])); - bench_decode_schema( - c, - "decode_wide_projection_narrow_json", - &wide_projection_data, - projected_schema, - WIDE_PROJECTION_ROWS, - ); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); From fbf50dee1e6adf40a3f9310e369486e1c6350dd6 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Wed, 7 Jan 2026 22:45:38 +0200 Subject: [PATCH 7/7] chore: fmt --- arrow-json/benches/json-reader.rs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arrow-json/benches/json-reader.rs b/arrow-json/benches/json-reader.rs index 743d434765cc..504839f8ffe2 100644 --- a/arrow-json/benches/json-reader.rs +++ b/arrow-json/benches/json-reader.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow_json::reader::Decoder; use arrow_json::ReaderBuilder; +use arrow_json::reader::Decoder; use arrow_schema::{DataType, Field, Schema}; use criterion::{ BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main, @@ -179,12 +179,7 @@ fn bench_binary_hex(c: &mut Criterion) { bench_decode_binary(c, "decode_binary_view_hex_json", &binary_data, view_field); } -fn bench_decode_schema( - c: &mut Criterion, - name: &str, - data: &[u8], - schema: Arc, -) { +fn bench_decode_schema(c: &mut Criterion, name: &str, data: &[u8], schema: Arc) { let mut group = c.benchmark_group(name); group.throughput(Throughput::Bytes(data.len() as u64)); group.sample_size(50); @@ -225,8 +220,7 @@ fn build_wide_projection_json(rows: usize, total_fields: usize) -> Vec { fn bench_wide_projection(c: &mut Criterion) { // Wide projection workload: tests overhead of parsing unused fields - let wide_projection_data = - build_wide_projection_json(ROWS, WIDE_PROJECTION_TOTAL_FIELDS); + let wide_projection_data = build_wide_projection_json(ROWS, WIDE_PROJECTION_TOTAL_FIELDS); let full_schema = build_schema(WIDE_PROJECTION_TOTAL_FIELDS); bench_decode_schema(