Skip to content

Commit 5a1e482

Browse files
rluvaton and alamb authored
bench: added to row_format benchmark conversion of 53 non-nested columns (#9081)
# Which issue does this PR close? N/A # Rationale for this change I noticed that converting around 50 columns becomes very slow, so I am adding a benchmark while I optimize those parts. # What changes are included in this PR? Added a new benchmark for `row_format` that converts around 50 column arrays. # Are these changes tested? N/A # Are there any user-facing changes? No. --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent b736f08 commit 5a1e482

File tree

2 files changed

+142
-3
lines changed

2 files changed

+142
-3
lines changed

arrow/benches/row_format.rs

Lines changed: 104 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,15 @@ use arrow::array::ArrayRef;
2323
use arrow::datatypes::{Int64Type, UInt64Type};
2424
use arrow::row::{RowConverter, SortField};
2525
use arrow::util::bench_util::{
26-
create_boolean_array, create_dict_from_values, create_primitive_array,
27-
create_string_array_with_len, create_string_dict_array, create_string_view_array_with_len,
26+
create_boolean_array, create_boolean_array_with_seed, create_dict_from_values,
27+
create_f64_array_with_seed, create_primitive_array, create_primitive_array_with_seed,
28+
create_string_array_with_len, create_string_array_with_len_range_and_prefix_and_seed,
29+
create_string_dict_array, create_string_view_array_with_len,
2830
create_string_view_array_with_max_len,
2931
};
3032
use arrow::util::data_gen::create_random_array;
3133
use arrow_array::Array;
32-
use arrow_array::types::Int32Type;
34+
use arrow_array::types::{Int8Type, Int32Type};
3335
use arrow_schema::{DataType, Field};
3436
use criterion::Criterion;
3537
use std::{hint, sync::Arc};
@@ -85,6 +87,102 @@ fn bench_iter(c: &mut Criterion) {
8587
});
8688
}
8789

90+
/// A single benchmark with a medium number of columns (around 50) without nested columns for real-world use cases
91+
/// This also makes sure there is a large gap between each value in the column and how it is laid out in the row format.
92+
/// and it is on the edge of not fitting in L3 on some machines
93+
fn run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(
94+
batch_size: usize,
95+
c: &mut Criterion,
96+
) {
97+
let mut seed = 0;
98+
99+
let mut cols: Vec<ArrayRef> = vec![];
100+
101+
for nulls in [0.0, 0.1, 0.2, 0.5] {
102+
seed += 1;
103+
cols.push(Arc::new(create_primitive_array_with_seed::<Int8Type>(
104+
batch_size, nulls, seed,
105+
)) as ArrayRef);
106+
}
107+
108+
for nulls in [0.0, 0.1, 0.2, 0.5] {
109+
seed += 1;
110+
cols.push(Arc::new(create_primitive_array_with_seed::<Int32Type>(
111+
batch_size, nulls, seed,
112+
)) as ArrayRef);
113+
}
114+
115+
for nulls in [0.0, 0.1, 0.2, 0.5] {
116+
seed += 1;
117+
cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
118+
batch_size, nulls, seed,
119+
)) as ArrayRef);
120+
}
121+
122+
for _ in 0..10 {
123+
seed += 1;
124+
cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
125+
batch_size, 0.0, seed,
126+
)) as ArrayRef);
127+
}
128+
129+
for nulls in [0.0, 0.1, 0.2, 0.5] {
130+
seed += 1;
131+
cols.push(Arc::new(
132+
create_string_array_with_len_range_and_prefix_and_seed::<i32>(
133+
batch_size, nulls, 0, 50, "", seed,
134+
),
135+
));
136+
}
137+
138+
for _ in 0..3 {
139+
seed += 1;
140+
cols.push(Arc::new(
141+
create_string_array_with_len_range_and_prefix_and_seed::<i32>(
142+
batch_size, 0.0, 0, 10, "", seed,
143+
),
144+
));
145+
}
146+
for _ in 0..3 {
147+
seed += 1;
148+
cols.push(Arc::new(
149+
create_string_array_with_len_range_and_prefix_and_seed::<i32>(
150+
batch_size, 0.0, 10, 20, "", seed,
151+
),
152+
));
153+
}
154+
for _ in 0..3 {
155+
seed += 1;
156+
cols.push(Arc::new(
157+
create_string_array_with_len_range_and_prefix_and_seed::<i32>(
158+
batch_size, 0.0, 20, 30, "", seed,
159+
),
160+
));
161+
}
162+
163+
for nulls in [0.0, 0.1, 0.2, 0.5] {
164+
seed += 1;
165+
cols.push(Arc::new(create_boolean_array_with_seed(
166+
batch_size, nulls, 0.5, seed,
167+
)));
168+
}
169+
170+
for _ in 0..10 {
171+
seed += 1;
172+
cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
173+
batch_size, 0.0, seed,
174+
)) as ArrayRef);
175+
}
176+
177+
for nulls in [0.0, 0.1, 0.2, 0.5] {
178+
seed += 1;
179+
cols.push(Arc::new(create_f64_array_with_seed(batch_size, nulls, seed)) as ArrayRef);
180+
}
181+
182+
assert_eq!(cols.len(), 53);
183+
do_bench(c, format!("{batch_size} 53 columns").as_str(), cols);
184+
}
185+
88186
fn row_bench(c: &mut Criterion) {
89187
let cols = vec![Arc::new(create_primitive_array::<UInt64Type>(4096, 0.)) as ArrayRef];
90188
do_bench(c, "4096 u64(0)", cols);
@@ -279,6 +377,9 @@ fn row_bench(c: &mut Criterion) {
279377
];
280378
do_bench(c, "4096 large_list(0) sliced to 10 of u64(0)", cols);
281379

380+
run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(4096, c);
381+
run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(8192, c);
382+
282383
bench_iter(c);
283384
}
284385

arrow/src/util/bench_util.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,29 @@ where
117117
.collect()
118118
}
119119

120+
/// Creates a random array of a given size and null density based on the provided seed
121+
pub fn create_boolean_array_with_seed(
122+
size: usize,
123+
null_density: f32,
124+
true_density: f32,
125+
seed: u64,
126+
) -> BooleanArray
127+
where
128+
StandardUniform: Distribution<bool>,
129+
{
130+
let mut rng = StdRng::seed_from_u64(seed);
131+
(0..size)
132+
.map(|_| {
133+
if rng.random::<f32>() < null_density {
134+
None
135+
} else {
136+
let value = rng.random::<f32>() < true_density;
137+
Some(value)
138+
}
139+
})
140+
.collect()
141+
}
142+
120143
/// Creates a random (but fixed-seeded) string array of a given size and null density.
121144
///
122145
/// Strings have a random length
@@ -734,3 +757,18 @@ pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
734757
})
735758
.collect()
736759
}
760+
761+
/// Creates a random f64 array of a given size and nan-value density based on a given seed
762+
pub fn create_f64_array_with_seed(size: usize, nan_density: f32, seed: u64) -> Float64Array {
763+
let mut rng = StdRng::seed_from_u64(seed);
764+
765+
(0..size)
766+
.map(|_| {
767+
if rng.random::<f32>() < nan_density {
768+
Some(f64::NAN)
769+
} else {
770+
Some(rng.random())
771+
}
772+
})
773+
.collect()
774+
}

0 commit comments

Comments
 (0)