Skip to content

Commit 2879b22

Browse files
committed
feat: add benchmark for wide JSON projection decoding
1 parent 04769f2 commit 2879b22

2 files changed

Lines changed: 129 additions & 0 deletions

File tree

arrow-json/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,7 @@ harness = false
7373
[[bench]]
7474
name = "binary_hex"
7575
harness = false
76+
77+
[[bench]]
78+
name = "wide_projection"
79+
harness = false
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow_json::ReaderBuilder;
19+
use arrow_schema::{DataType, Field, Schema};
20+
use criterion::{
21+
BenchmarkId, Criterion, SamplingMode, Throughput, criterion_group, criterion_main,
22+
};
23+
use std::fmt::Write;
24+
use std::hint::black_box;
25+
use std::sync::Arc;
26+
27+
// Projection benchmark constants

/// Number of JSON rows in the generated benchmark input (16K rows).
const WIDE_PROJECTION_ROWS: usize = 1 << 14;
/// Number of fields in every generated row; the narrow benchmark selects only 3 of them.
const WIDE_PROJECTION_TOTAL_FIELDS: usize = 100;
30+
31+
fn bench_decode_schema(
32+
c: &mut Criterion,
33+
name: &str,
34+
data: &[u8],
35+
schema: Arc<Schema>,
36+
rows: usize,
37+
projection: bool,
38+
) {
39+
let mut group = c.benchmark_group(name);
40+
group.throughput(Throughput::Bytes(data.len() as u64));
41+
group.sample_size(50);
42+
group.measurement_time(std::time::Duration::from_secs(5));
43+
group.warm_up_time(std::time::Duration::from_secs(2));
44+
group.sampling_mode(SamplingMode::Flat);
45+
group.bench_function(BenchmarkId::from_parameter(rows), |b| {
46+
b.iter(|| {
47+
let mut decoder = ReaderBuilder::new(schema.clone())
48+
.with_batch_size(rows)
49+
.build_decoder()
50+
.unwrap();
51+
52+
let mut offset = 0;
53+
while offset < data.len() {
54+
let read = decoder.decode(black_box(&data[offset..])).unwrap();
55+
if read == 0 {
56+
break;
57+
}
58+
offset += read;
59+
}
60+
61+
let batch = decoder.flush().unwrap();
62+
black_box(batch);
63+
})
64+
});
65+
group.finish();
66+
}
67+
68+
/// Builds newline-delimited JSON with `rows` objects of `total_fields` integer fields each.
///
/// Every row has the form `{"f0":0000000,"f1":0000001,...}` followed by `\n`:
/// field `i` is named `f{i}` and its value is `i` zero-padded to at least 7
/// digits, so all rows are byte-identical.
///
/// Returns the UTF-8 bytes of the generated document. `rows == 0` yields an
/// empty buffer and `total_fields == 0` yields rows of `{}`.
fn build_wide_projection_json(rows: usize, total_fields: usize) -> Vec<u8> {
    // Estimate: each field takes at most ~15 bytes (`"fXX":VVVVVVV,`), plus a
    // little per-row slack for the braces and the newline.
    let per_row_size = total_fields * 15 + 10;
    let mut data = String::with_capacity(rows * per_row_size);

    for _ in 0..rows {
        data.push('{');
        for i in 0..total_fields {
            if i > 0 {
                data.push(',');
            }
            // Use fixed-width values for stable benchmarks: 7 digits
            write!(data, "\"f{}\":{:07}", i, i).expect("writing to a String cannot fail");
        }
        data.push_str("}\n");
    }

    data.into_bytes()
}
87+
88+
fn criterion_benchmark(c: &mut Criterion) {
89+
// Wide projection workload: tests overhead of parsing unused fields
90+
let wide_projection_data =
91+
build_wide_projection_json(WIDE_PROJECTION_ROWS, WIDE_PROJECTION_TOTAL_FIELDS);
92+
93+
// Full schema: all 100 fields
94+
let mut full_fields = Vec::new();
95+
for i in 0..WIDE_PROJECTION_TOTAL_FIELDS {
96+
full_fields.push(Field::new(format!("f{}", i), DataType::Int64, false));
97+
}
98+
let full_schema = Arc::new(Schema::new(full_fields));
99+
bench_decode_schema(
100+
c,
101+
"decode_wide_projection_full_json",
102+
&wide_projection_data,
103+
full_schema,
104+
WIDE_PROJECTION_ROWS,
105+
false,
106+
);
107+
108+
// Projected schema: only 3 fields (f0, f10, f50) out of 100
109+
let projected_schema = Arc::new(Schema::new(vec![
110+
Field::new("f0", DataType::Int64, false),
111+
Field::new("f10", DataType::Int64, false),
112+
Field::new("f50", DataType::Int64, false),
113+
]));
114+
bench_decode_schema(
115+
c,
116+
"decode_wide_projection_narrow_json",
117+
&wide_projection_data,
118+
projected_schema,
119+
WIDE_PROJECTION_ROWS,
120+
true,
121+
);
122+
}
123+
124+
criterion_group!(benches, criterion_benchmark);
125+
criterion_main!(benches);

0 commit comments

Comments
 (0)