Add List to interleave_kernels benchmark (#8980)

alamb · web-flow · commit 026a2606aee6 · 2025-12-11T11:20:54.000+01:00
# Which issue does this PR close? - Part of #8953 # Rationale for this change While reviewing #8953 from @asubiotto I noticed there was no benchmark for interleave with ListArray. Let's add some so we can evaluate the performance impact of ttps://github.com//pull/8953 and future changes. # What changes are included in this PR? Add benchmark for list interleaving # Are these changes tested? I ran the bechmarks manually ```shell cargo bench --bench interleave_kernels -- list ``` # Are there any user-facing changes? No
diff --git a/arrow/benches/interleave_kernels.rs b/arrow/benches/interleave_kernels.rs
@@ -116,6 +116,11 @@ fn add_benchmark(c: &mut Criterion) {
 
     let string_view = create_string_view_array(1024, 0.0);
 
+    // use 8192 as a standard list size for better coverage
+    let list_i64 = create_primitive_list_array_with_seed::<i32, Int64Type>(8192, 0.1, 0.1, 20, 42);
+    let list_i64_no_nulls =
+        create_primitive_list_array_with_seed::<i32, Int64Type>(8192, 0.0, 0.0, 20, 42);
+
     let cases: &[(&str, &dyn Array)] = &[
         ("i32(0.0)", &i32),
         ("i32(0.5)", &i32_opt),
@@ -136,6 +141,8 @@ fn add_benchmark(c: &mut Criterion) {
             "struct(i32(0.0), str(20, 0.0)",
             &struct_i32_no_nulls_string_no_nulls,
         ),
+        ("list<i64>(0.1,0.1,20)", &list_i64),
+        ("list<i64>(0.0,0.0,20)", &list_i64_no_nulls),
     ];
 
     for (prefix, base) in cases {
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
@@ -398,6 +398,49 @@ pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
     data.iter().map(|x| x.as_deref()).collect()
 }
 
+/// Create a List/LargeList Array  of primitive values
+///
+/// Arguments:
+/// - `size`: number of lists in the array
+/// - `null_density`: density of nulls in the list array
+/// - `list_null_density`: density of nulls in the primitive arrays inside the lists
+/// - `max_list_size`: maximum size of each list (actual size is random between 0 and max_list_size)
+/// - `seed`: seed for the random number generator
+pub fn create_primitive_list_array_with_seed<O, T>(
+    size: usize,
+    null_density: f32,
+    list_null_density: f32,
+    max_list_size: usize,
+    seed: u64,
+) -> GenericListArray<O>
+where
+    O: OffsetSizeTrait,
+    T: ArrowPrimitiveType,
+    StandardUniform: Distribution<T::Native>,
+{
+    let mut rng = StdRng::seed_from_u64(seed);
+
+    let values = (0..size).map(|_| {
+        if rng.random::<f32>() < null_density {
+            None
+        } else {
+            let list_size = rng.random_range(0..=max_list_size);
+            let list_values: Vec<Option<T::Native>> = (0..list_size)
+                .map(|_| {
+                    if rng.random::<f32>() < list_null_density {
+                        None
+                    } else {
+                        Some(rng.random())
+                    }
+                })
+                .collect();
+            Some(list_values)
+        }
+    });
+
+    GenericListArray::<O>::from_iter_primitive::<T, _, _>(values)
+}
+
 /// Create primitive run array for given logical and physical array lengths
 pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
     logical_array_len: usize,