From e6e5b7c469ad3b8e9c73be5650f2f0ecfd849223 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 31 Dec 2025 22:22:00 +0200 Subject: [PATCH 1/2] bench: added to row_format benchmark conversion of 50 non-nested columns --- arrow/benches/row_format.rs | 106 ++++++++++++++++++++++++++++++++++- arrow/src/util/bench_util.rs | 38 +++++++++++++ 2 files changed, 141 insertions(+), 3 deletions(-) diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index d67095ac2c43..dd759a30017d 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -23,13 +23,15 @@ use arrow::array::ArrayRef; use arrow::datatypes::{Int64Type, UInt64Type}; use arrow::row::{RowConverter, SortField}; use arrow::util::bench_util::{ - create_boolean_array, create_dict_from_values, create_primitive_array, - create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len, + create_boolean_array, create_boolean_array_with_seed, create_dict_from_values, + create_f64_array_with_seed, create_primitive_array, create_primitive_array_with_seed, + create_string_array_with_len, create_string_array_with_len_range_and_prefix_and_seed, + create_string_dict_array, create_string_view_array_with_len, create_string_view_array_with_max_len, }; use arrow::util::data_gen::create_random_array; use arrow_array::Array; -use arrow_array::types::Int32Type; +use arrow_array::types::{Int8Type, Int32Type}; use arrow_schema::{DataType, Field}; use criterion::Criterion; use std::{hint, sync::Arc}; @@ -85,6 +87,101 @@ fn bench_iter(c: &mut Criterion) { }); } +/// A single benchmark with a medium number of columns (around 50) without nested columns for real-world use cases +/// This also makes sure there is a large gap between each value in the column and how it is laid out in the row format. +/// and it is on the edge of not fitting in L3 on some machines +fn run_benchmark_on_medium_amount_and_types_of_columns_without_nesting( + batch_size: usize, + c: &mut Criterion, +) { + let mut seed = 0; + + let mut cols: Vec = vec![]; + + for nulls in [0.0, 0.1, 0.2, 0.5] { + seed += 1; + cols.push(Arc::new(create_primitive_array_with_seed::( + batch_size, nulls, seed, + )) as ArrayRef); + } + + for nulls in [0.0, 0.1, 0.2, 0.5] { + seed += 1; + cols.push(Arc::new(create_primitive_array_with_seed::( + batch_size, nulls, seed, + )) as ArrayRef); + } + + for nulls in [0.0, 0.1, 0.2, 0.5] { + seed += 1; + cols.push(Arc::new(create_primitive_array_with_seed::( + batch_size, nulls, seed, + )) as ArrayRef); + } + + for _ in 0..10 { + seed += 1; + cols.push(Arc::new(create_primitive_array_with_seed::( + batch_size, 0.0, seed, + )) as ArrayRef); + } + + for nulls in [0.0, 0.1, 0.2, 0.5] { + seed += 1; + cols.push(Arc::new( + create_string_array_with_len_range_and_prefix_and_seed::( + batch_size, nulls, 0, 50, "", seed, + ), + )); + } + + for _ in 0..3 { + seed += 1; + cols.push(Arc::new( + create_string_array_with_len_range_and_prefix_and_seed::( + batch_size, 0.0, 0, 10, "", seed, + ), + )); + } + for _ in 0..3 { + seed += 1; + cols.push(Arc::new( + create_string_array_with_len_range_and_prefix_and_seed::( + batch_size, 0.0, 10, 20, "", seed, + ), + )); + } + for _ in 0..3 { + seed += 1; + cols.push(Arc::new( + create_string_array_with_len_range_and_prefix_and_seed::( + batch_size, 0.0, 20, 30, "", seed, + ), + )); + } + + for nulls in [0.0, 0.1, 0.2, 0.5] { + seed += 1; + cols.push(Arc::new(create_boolean_array_with_seed( + batch_size, nulls, 0.5, seed, + ))); + } + + for _ in 0..10 { + seed += 1; + cols.push(Arc::new(create_primitive_array_with_seed::( + batch_size, 0.0, seed, + )) as ArrayRef); + } + + for nulls in [0.0, 0.1, 0.2, 0.5] { + seed += 1; + cols.push(Arc::new(create_f64_array_with_seed(batch_size, nulls, seed)) as ArrayRef); + } + + do_bench(c, format!("{batch_size} lot of columns").as_str(), cols); +} + fn row_bench(c: &mut Criterion) { let cols = vec![Arc::new(create_primitive_array::(4096, 0.)) as ArrayRef]; do_bench(c, "4096 u64(0)", cols); @@ -279,6 +376,9 @@ fn row_bench(c: &mut Criterion) { ]; do_bench(c, "4096 large_list(0) sliced to 10 of u64(0)", cols); + run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(4096, c); + run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(8192, c); + bench_iter(c); } diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 1f1dcff9b62a..bcf7a559e960 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -117,6 +117,29 @@ where .collect() } +/// Creates a random array of a given size and null density based on the provided seed +pub fn create_boolean_array_with_seed( + size: usize, + null_density: f32, + true_density: f32, + seed: u64, +) -> BooleanArray +where + StandardUniform: Distribution, +{ + let mut rng = StdRng::seed_from_u64(seed); + (0..size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + let value = rng.random::() < true_density; + Some(value) + } + }) + .collect() +} + /// Creates a random (but fixed-seeded) string array of a given size and null density. /// /// Strings have a random length @@ -734,3 +757,18 @@ pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array { }) .collect() } + +/// Creates a random f64 array of a given size and nan-value density based on a given seed +pub fn create_f64_array_with_seed(size: usize, nan_density: f32, seed: u64) -> Float64Array { + let mut rng = StdRng::seed_from_u64(seed); + + (0..size) + .map(|_| { + if rng.random::() < nan_density { + Some(f64::NAN) + } else { + Some(rng.random()) + } + }) + .collect() +} From 3c5bf6340721bea225e6b06ae25b24316551dbf2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Jan 2026 15:49:12 -0500 Subject: [PATCH 2/2] Explicitly list number of columns --- arrow/benches/row_format.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs index dd759a30017d..1c120bb2f24e 100644 --- a/arrow/benches/row_format.rs +++ b/arrow/benches/row_format.rs @@ -179,7 +179,8 @@ fn run_benchmark_on_medium_amount_and_types_of_columns_without_nesting( cols.push(Arc::new(create_f64_array_with_seed(batch_size, nulls, seed)) as ArrayRef); } - do_bench(c, format!("{batch_size} lot of columns").as_str(), cols); + assert_eq!(cols.len(), 53); + do_bench(c, format!("{batch_size} 53 columns").as_str(), cols); } fn row_bench(c: &mut Criterion) {