Skip to content

Commit 814ee42

Browse files
authored
Add benchmarks for Utf8View scalars for zip (#8988)
# Which issue does this PR close? N/A # Rationale for this change I have a PR that improves zip performance for Utf8View/BinaryView scalars, and I need benchmarks to measure it. - #8963 # What changes are included in this PR? This extends the zip benchmarks with one new input generator for StringViews and two more benchmark cases that test scalar combinations of StringViews with different length ranges. # Are these changes tested? N/A # Are there any user-facing changes? No
1 parent 7f656ff commit 814ee42

File tree

2 files changed

+75
-0
lines changed

2 files changed

+75
-0
lines changed

arrow/benches/zip_kernels.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use rand::distr::{Distribution, StandardUniform};
2121
use rand::prelude::StdRng;
2222
use rand::{Rng, SeedableRng};
2323
use std::hint;
24+
use std::ops::Range;
2425
use std::sync::Arc;
2526

2627
use arrow::array::*;
@@ -133,6 +134,35 @@ where
133134
}
134135
}
135136

137+
struct GenerateStringView {
138+
range: Range<usize>,
139+
description: String,
140+
_marker: std::marker::PhantomData<StringViewType>,
141+
}
142+
143+
impl InputGenerator for GenerateStringView {
144+
fn name(&self) -> &str {
145+
self.description.as_str()
146+
}
147+
fn generate_scalar_with_null_value(&self) -> ArrayRef {
148+
new_null_array(&DataType::Utf8View, 1)
149+
}
150+
151+
fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) -> Vec<ArrayRef> {
152+
let array = self.generate_array(seed, number_of_scalars, 0.0);
153+
(0..number_of_scalars).map(|i| array.slice(i, 1)).collect()
154+
}
155+
156+
fn generate_array(&self, seed: u64, array_length: usize, null_percentage: f32) -> ArrayRef {
157+
Arc::new(create_string_view_array_with_len_range_and_seed(
158+
array_length,
159+
null_percentage,
160+
self.range.clone(),
161+
seed,
162+
))
163+
}
164+
}
165+
136166
fn mask_cases(len: usize) -> Vec<(&'static str, BooleanArray)> {
137167
vec![
138168
("all_true", create_boolean_array(len, 0.0, 1.0)),
@@ -273,6 +303,24 @@ fn add_benchmark(c: &mut Criterion) {
273303
_marker: std::marker::PhantomData,
274304
},
275305
);
306+
307+
bench_zip_on_input_generator(
308+
c,
309+
&GenerateStringView {
310+
description: "string_views size (3..10)".to_string(),
311+
range: 3..10,
312+
_marker: std::marker::PhantomData,
313+
},
314+
);
315+
316+
bench_zip_on_input_generator(
317+
c,
318+
&GenerateStringView {
319+
description: "string_views size (10..100)".to_string(),
320+
range: 10..100,
321+
_marker: std::marker::PhantomData,
322+
},
323+
);
276324
}
277325

278326
criterion_group!(benches, add_benchmark);

arrow/src/util/bench_util.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,33 @@ pub fn create_string_array_with_len_range_and_prefix_and_seed<Offset: OffsetSize
208208
})
209209
.collect()
210210
}
211+
/// Creates a string view array of a given range, null density and length
212+
///
213+
/// Arguments:
214+
/// - `size`: number of string view array
215+
/// - `null_density`: density of nulls in the string view array
216+
/// - `range`: range size of each string in the string view array
217+
/// - `seed`: seed for the random number generator
218+
pub fn create_string_view_array_with_len_range_and_seed(
219+
size: usize,
220+
null_density: f32,
221+
range: Range<usize>,
222+
seed: u64,
223+
) -> StringViewArray {
224+
let rng = &mut StdRng::seed_from_u64(seed);
225+
(0..size)
226+
.map(|_| {
227+
if rng.random::<f32>() < null_density {
228+
None
229+
} else {
230+
let str_len = rng.random_range(range.clone());
231+
let value = rng.sample_iter(&Alphanumeric).take(str_len).collect();
232+
let value = String::from_utf8(value).unwrap();
233+
Some(value)
234+
}
235+
})
236+
.collect()
237+
}
211238

212239
fn create_string_view_array_with_len_range_and_prefix(
213240
size: usize,

0 commit comments

Comments
 (0)