Skip to content

Commit a1674ad

Browse files
committed
arrow-select: implement specialized interleave_list
Previously, List and LargeList would fall through to the interleave_fallback match arm, which is inefficient. This commit implements interleave_list, which interleaves a list's child arrays and rebuilds the offsets buffer. Running it on production tests reduced memory by 80%. Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent 6ff8cc4 commit a1674ad

File tree

1 file changed

+61
-8
lines changed

1 file changed

+61
-8
lines changed

arrow-select/src/interleave.rs

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use arrow_array::*;
2626
use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, NullBuffer, OffsetBuffer};
2727
use arrow_data::ByteView;
2828
use arrow_data::transform::MutableArrayData;
29-
use arrow_schema::{ArrowError, DataType, Fields};
29+
use arrow_schema::{ArrowError, DataType, FieldRef, Fields};
3030
use std::sync::Arc;
3131

3232
macro_rules! primitive_helper {
@@ -106,6 +106,8 @@ pub fn interleave(
106106
_ => unreachable!("illegal dictionary key type {k}")
107107
},
108108
DataType::Struct(fields) => interleave_struct(fields, values, indices),
109+
DataType::List(field) => interleave_list::<i32>(values, indices, field),
110+
DataType::LargeList(field) => interleave_list::<i64>(values, indices, field),
109111
_ => interleave_fallback(values, indices)
110112
}
111113
}
@@ -319,6 +321,45 @@ fn interleave_struct(
319321
Ok(Arc::new(struct_array))
320322
}
321323

324+
fn interleave_list<O: OffsetSizeTrait>(
325+
values: &[&dyn Array],
326+
indices: &[(usize, usize)],
327+
field: &FieldRef,
328+
) -> Result<ArrayRef, ArrowError> {
329+
let interleaved = Interleave::<'_, GenericListArray<O>>::new(values, indices);
330+
331+
let mut child_indices = Vec::new();
332+
let mut offsets = Vec::with_capacity(indices.len() + 1);
333+
offsets.push(O::from_usize(0).unwrap());
334+
335+
for (array, row) in indices {
336+
let list = interleaved.arrays[*array];
337+
let start = list.value_offsets()[*row].as_usize();
338+
let end = list.value_offsets()[*row + 1].as_usize();
339+
340+
child_indices.extend((start..end).map(|i| (*array, i)));
341+
offsets.push(O::from_usize(child_indices.len()).expect("offset overflow"));
342+
}
343+
344+
let child_arrays: Vec<&dyn Array> = interleaved
345+
.arrays
346+
.iter()
347+
.map(|list| list.values().as_ref())
348+
.collect();
349+
350+
let interleaved_values = interleave(&child_arrays, &child_indices)?;
351+
352+
let offsets = OffsetBuffer::new(offsets.into());
353+
let list_array = GenericListArray::<O>::new(
354+
field.clone(),
355+
offsets,
356+
interleaved_values,
357+
interleaved.nulls,
358+
);
359+
360+
Ok(Arc::new(list_array))
361+
}
362+
322363
/// Fallback implementation of interleave using [`MutableArrayData`]
323364
fn interleave_fallback(
324365
values: &[&dyn Array],
@@ -488,7 +529,7 @@ pub fn interleave_record_batch(
488529
mod tests {
489530
use super::*;
490531
use arrow_array::Int32RunArray;
491-
use arrow_array::builder::{Int32Builder, ListBuilder, PrimitiveRunBuilder};
532+
use arrow_array::builder::{GenericListBuilder, Int32Builder, PrimitiveRunBuilder};
492533
use arrow_array::types::Int8Type;
493534
use arrow_schema::Field;
494535

@@ -622,10 +663,9 @@ mod tests {
622663
assert_eq!(string_result, vec!["v0", "v0", "v49"]);
623664
}
624665

625-
#[test]
626-
fn test_lists() {
666+
fn test_interleave_lists<O: OffsetSizeTrait>() {
627667
// [[1, 2], null, [3]]
628-
let mut a = ListBuilder::new(Int32Builder::new());
668+
let mut a = GenericListBuilder::<O, _>::new(Int32Builder::new());
629669
a.values().append_value(1);
630670
a.values().append_value(2);
631671
a.append(true);
@@ -635,7 +675,7 @@ mod tests {
635675
let a = a.finish();
636676

637677
// [[4], null, [5, 6, null]]
638-
let mut b = ListBuilder::new(Int32Builder::new());
678+
let mut b = GenericListBuilder::<O, _>::new(Int32Builder::new());
639679
b.values().append_value(4);
640680
b.append(true);
641681
b.append(false);
@@ -646,10 +686,13 @@ mod tests {
646686
let b = b.finish();
647687

648688
let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap();
649-
let v = values.as_any().downcast_ref::<ListArray>().unwrap();
689+
let v = values
690+
.as_any()
691+
.downcast_ref::<GenericListArray<O>>()
692+
.unwrap();
650693

651694
// [[3], null, [4], [5, 6, null], null]
652-
let mut expected = ListBuilder::new(Int32Builder::new());
695+
let mut expected = GenericListBuilder::<O, _>::new(Int32Builder::new());
653696
expected.values().append_value(3);
654697
expected.append(true);
655698
expected.append(false);
@@ -665,6 +708,16 @@ mod tests {
665708
assert_eq!(v, &expected);
666709
}
667710

711+
#[test]
712+
fn test_lists() {
713+
test_interleave_lists::<i32>();
714+
}
715+
716+
#[test]
717+
fn test_large_lists() {
718+
test_interleave_lists::<i64>();
719+
}
720+
668721
#[test]
669722
fn test_struct_without_nulls() {
670723
let fields = Fields::from(vec![

0 commit comments

Comments
 (0)