Skip to content

Commit 06c3bd0

Browse files
authored
feat(parquet): add wide-schema writer overhead benchmark (#9723)
# Which issue does this PR close? - Contributes to #9722 # Rationale for this change Existing writer benchmarks use narrow schemas (5–10 columns) and primarily measure data encoding throughput. They don't capture the per-column structural overhead that dominates at high column cardinality (thousands to hundreds of thousands of columns), such as allocation and metadata assembly. # What changes are included in this PR? This commit adds benchmarks to fill that gap by writing a single-row batch through `ArrowWriter` with 1k/5k/10k flat `Float32` columns and per-column `WriterProperties` entries, isolating the cost of the writer infrastructure itself. Baseline results (Apple M1 Max): ``` writer_overhead/1000_cols/per_column_props 3.72 ms writer_overhead/5000_cols/per_column_props 54.96 ms writer_overhead/10000_cols/per_column_props 220.73 ms ``` # Are these changes tested? N/A # Are there any user-facing changes? N/A Signed-off-by: Hippolyte Barraud <hippolyte.barraud@datadoghq.com>
1 parent 38d78c3 commit 06c3bd0

File tree

2 files changed

+91
-0
lines changed

2 files changed

+91
-0
lines changed

parquet/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,11 @@ name = "push_decoder"
226226
required-features = ["arrow"]
227227
harness = false
228228

229+
[[bench]]
230+
name = "writer_overhead"
231+
required-features = ["arrow"]
232+
harness = false
233+
229234
[[bench]]
230235
name = "arrow_reader"
231236
required-features = ["arrow", "test_common", "experimental"]

parquet/benches/writer_overhead.rs

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Benchmarks for writer per-column overhead at high column cardinality.
19+
//!
20+
//! These benchmarks measure the structural cost of creating, writing, and
21+
//! closing a parquet file with many columns while keeping actual data
22+
//! encoding negligible (1 row per column). This isolates overhead such as
23+
//! `WriterProperties` per-column lookups, `GenericColumnWriter` allocation,
24+
//! and metadata assembly.
25+
26+
use criterion::{Criterion, criterion_group, criterion_main};
27+
use std::hint::black_box;
28+
use std::io::Empty;
29+
use std::sync::Arc;
30+
31+
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
32+
use arrow_array::{Float32Array, RecordBatch};
33+
use parquet::arrow::ArrowWriter;
34+
use parquet::basic::Compression;
35+
use parquet::file::properties::WriterProperties;
36+
use parquet::schema::types::ColumnPath;
37+
38+
const COLUMN_COUNTS: &[usize] = &[1_000, 5_000, 10_000];
39+
40+
fn make_wide_schema(num_columns: usize) -> SchemaRef {
41+
let fields: Vec<Field> = (0..num_columns)
42+
.map(|i| Field::new(format!("c{i}"), DataType::Float32, false))
43+
.collect();
44+
Arc::new(Schema::new(fields))
45+
}
46+
47+
fn make_single_row_batch(schema: &SchemaRef) -> RecordBatch {
48+
let columns: Vec<Arc<dyn arrow_array::Array>> = (0..schema.fields().len())
49+
.map(|_| Arc::new(Float32Array::from(vec![0.0f32])) as _)
50+
.collect();
51+
RecordBatch::try_new(schema.clone(), columns).unwrap()
52+
}
53+
54+
/// Build WriterProperties with a per-column property set for every column,
55+
/// populating the internal HashMap so that per-column lookups are exercised.
56+
fn make_per_column_props(schema: &SchemaRef) -> WriterProperties {
57+
let mut builder = WriterProperties::builder().set_dictionary_enabled(false);
58+
for field in schema.fields() {
59+
builder = builder.set_column_compression(
60+
ColumnPath::from(field.name().as_str()),
61+
Compression::UNCOMPRESSED,
62+
);
63+
}
64+
builder.build()
65+
}
66+
67+
fn bench_writer_overhead(c: &mut Criterion) {
68+
for &num_cols in COLUMN_COUNTS {
69+
let schema = make_wide_schema(num_cols);
70+
let batch = make_single_row_batch(&schema);
71+
let props = make_per_column_props(&schema);
72+
73+
c.bench_function(&format!("writer_overhead/{num_cols}_cols"), |b| {
74+
b.iter(|| {
75+
let mut writer =
76+
ArrowWriter::try_new(Empty::default(), schema.clone(), Some(props.clone()))
77+
.unwrap();
78+
writer.write(black_box(&batch)).unwrap();
79+
black_box(writer.close()).unwrap();
80+
});
81+
});
82+
}
83+
}
84+
85+
// Register the benchmark entry point with criterion's harness
// (Cargo.toml sets `harness = false` for this bench target).
criterion_group!(benches, bench_writer_overhead);
criterion_main!(benches);

0 commit comments

Comments
 (0)