Skip to content

Commit 06c3bd0

Browse files
authored
feat(parquet): add wide-schema writer overhead benchmark (#9723)
# Which issue does this PR close? - Contributes to #9722 # Rationale for this change Existing writer benchmarks use narrow schemas (5–10 columns) and primarily measure data encoding throughput. They don't capture the per-column structural overhead that dominates at high column cardinality (thousands to hundreds of thousands of columns), such as allocation and metadata assembly. # What changes are included in this PR? This commit adds benchmarks to fill that gap by writing a single-row batch through `ArrowWriter` with 1k/5k/10k flat `Float32` columns and per-column `WriterProperties` entries, isolating the cost of the writer infrastructure itself. Baseline results (Apple M1 Max): ``` writer_overhead/1000_cols/per_column_props 3.72 ms writer_overhead/5000_cols/per_column_props 54.96 ms writer_overhead/10000_cols/per_column_props 220.73 ms ``` # Are these changes tested? N/A # Are there any user-facing changes? N/A Signed-off-by: Hippolyte Barraud <hippolyte.barraud@datadoghq.com>
1 parent 38d78c3 commit 06c3bd0

File tree

2 files changed

+91
-0
lines changed

2 files changed

+91
-0
lines changed

parquet/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,11 @@ name = "push_decoder"
226226
required-features = ["arrow"]
227227
harness = false
228228

229+
[[bench]]
230+
name = "writer_overhead"
231+
required-features = ["arrow"]
232+
harness = false
233+
229234
[[bench]]
230235
name = "arrow_reader"
231236
required-features = ["arrow", "test_common", "experimental"]

parquet/benches/writer_overhead.rs

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Benchmarks for writer per-column overhead at high column cardinality.
19+
//!
20+
//! These benchmarks measure the structural cost of creating, writing, and
21+
//! closing a parquet file with many columns while keeping actual data
22+
//! encoding negligible (1 row per column). This isolates overhead such as
23+
//! `WriterProperties` per-column lookups, `GenericColumnWriter` allocation,
24+
//! and metadata assembly.
25+
26+
use criterion::{Criterion, criterion_group, criterion_main};
27+
use std::hint::black_box;
28+
use std::io::Empty;
29+
use std::sync::Arc;
30+
31+
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
32+
use arrow_array::{Float32Array, RecordBatch};
33+
use parquet::arrow::ArrowWriter;
34+
use parquet::basic::Compression;
35+
use parquet::file::properties::WriterProperties;
36+
use parquet::schema::types::ColumnPath;
37+
38+
const COLUMN_COUNTS: &[usize] = &[1_000, 5_000, 10_000];
39+
40+
fn make_wide_schema(num_columns: usize) -> SchemaRef {
41+
let fields: Vec<Field> = (0..num_columns)
42+
.map(|i| Field::new(format!("c{i}"), DataType::Float32, false))
43+
.collect();
44+
Arc::new(Schema::new(fields))
45+
}
46+
47+
fn make_single_row_batch(schema: &SchemaRef) -> RecordBatch {
48+
let columns: Vec<Arc<dyn arrow_array::Array>> = (0..schema.fields().len())
49+
.map(|_| Arc::new(Float32Array::from(vec![0.0f32])) as _)
50+
.collect();
51+
RecordBatch::try_new(schema.clone(), columns).unwrap()
52+
}
53+
54+
/// Build WriterProperties with a per-column property set for every column,
55+
/// populating the internal HashMap so that per-column lookups are exercised.
56+
fn make_per_column_props(schema: &SchemaRef) -> WriterProperties {
57+
let mut builder = WriterProperties::builder().set_dictionary_enabled(false);
58+
for field in schema.fields() {
59+
builder = builder.set_column_compression(
60+
ColumnPath::from(field.name().as_str()),
61+
Compression::UNCOMPRESSED,
62+
);
63+
}
64+
builder.build()
65+
}
66+
67+
fn bench_writer_overhead(c: &mut Criterion) {
68+
for &num_cols in COLUMN_COUNTS {
69+
let schema = make_wide_schema(num_cols);
70+
let batch = make_single_row_batch(&schema);
71+
let props = make_per_column_props(&schema);
72+
73+
c.bench_function(&format!("writer_overhead/{num_cols}_cols"), |b| {
74+
b.iter(|| {
75+
let mut writer =
76+
ArrowWriter::try_new(Empty::default(), schema.clone(), Some(props.clone()))
77+
.unwrap();
78+
writer.write(black_box(&batch)).unwrap();
79+
black_box(writer.close()).unwrap();
80+
});
81+
});
82+
}
83+
}
84+
85+
// Register the benchmark entry point with criterion's harness
// (Cargo.toml sets `harness = false` for this bench target).
criterion_group!(benches, bench_writer_overhead);
criterion_main!(benches);

0 commit comments

Comments
 (0)