Skip to content

Commit 5b441ec

Browse files
committed
arrow-select: implement specialized interleave_list
Previously, List and LargeList would fall through to the interleave_fallback match arm, which is inefficient. This commit implements interleave_list, which interleaves a list's child arrays and rebuilds the offsets buffer. Running it on production tests reduced memory by 80%. Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent f8796fd commit 5b441ec

File tree

1 file changed

+66
-8
lines changed

1 file changed

+66
-8
lines changed

arrow-select/src/interleave.rs

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ use arrow_array::*;
2626
use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, NullBuffer, OffsetBuffer};
2727
use arrow_data::ByteView;
2828
use arrow_data::transform::MutableArrayData;
29-
use arrow_schema::{ArrowError, DataType, Fields};
29+
use arrow_schema::{ArrowError, DataType, FieldRef, Fields};
3030
use std::sync::Arc;
3131

3232
macro_rules! primitive_helper {
@@ -106,6 +106,8 @@ pub fn interleave(
106106
_ => unreachable!("illegal dictionary key type {k}")
107107
},
108108
DataType::Struct(fields) => interleave_struct(fields, values, indices),
109+
DataType::List(field) => interleave_list::<i32>(values, indices, field),
110+
DataType::LargeList(field) => interleave_list::<i64>(values, indices, field),
109111
_ => interleave_fallback(values, indices)
110112
}
111113
}
@@ -319,6 +321,50 @@ fn interleave_struct(
319321
Ok(Arc::new(struct_array))
320322
}
321323

324+
fn interleave_list<O: OffsetSizeTrait>(
325+
values: &[&dyn Array],
326+
indices: &[(usize, usize)],
327+
field: &FieldRef,
328+
) -> Result<ArrayRef, ArrowError> {
329+
let interleaved = Interleave::<'_, GenericListArray<O>>::new(values, indices);
330+
331+
let mut capacity = 0usize;
332+
let mut offsets = Vec::with_capacity(indices.len() + 1);
333+
offsets.push(O::from_usize(0).unwrap());
334+
offsets.extend(indices.iter().map(|(array, row)| {
335+
let o = interleaved.arrays[*array].value_offsets();
336+
let element_len = o[*row + 1].as_usize() - o[*row].as_usize();
337+
capacity += element_len;
338+
O::from_usize(capacity).expect("offset overflow")
339+
}));
340+
341+
let mut child_indices = Vec::with_capacity(capacity);
342+
for (array, row) in indices {
343+
let list = interleaved.arrays[*array];
344+
let start = list.value_offsets()[*row].as_usize();
345+
let end = list.value_offsets()[*row + 1].as_usize();
346+
child_indices.extend((start..end).map(|i| (*array, i)));
347+
}
348+
349+
let child_arrays: Vec<&dyn Array> = interleaved
350+
.arrays
351+
.iter()
352+
.map(|list| list.values().as_ref())
353+
.collect();
354+
355+
let interleaved_values = interleave(&child_arrays, &child_indices)?;
356+
357+
let offsets = OffsetBuffer::new(offsets.into());
358+
let list_array = GenericListArray::<O>::new(
359+
field.clone(),
360+
offsets,
361+
interleaved_values,
362+
interleaved.nulls,
363+
);
364+
365+
Ok(Arc::new(list_array))
366+
}
367+
322368
/// Fallback implementation of interleave using [`MutableArrayData`]
323369
fn interleave_fallback(
324370
values: &[&dyn Array],
@@ -488,7 +534,7 @@ pub fn interleave_record_batch(
488534
mod tests {
489535
use super::*;
490536
use arrow_array::Int32RunArray;
491-
use arrow_array::builder::{Int32Builder, ListBuilder, PrimitiveRunBuilder};
537+
use arrow_array::builder::{GenericListBuilder, Int32Builder, PrimitiveRunBuilder};
492538
use arrow_array::types::Int8Type;
493539
use arrow_schema::Field;
494540

@@ -622,10 +668,9 @@ mod tests {
622668
assert_eq!(string_result, vec!["v0", "v0", "v49"]);
623669
}
624670

625-
#[test]
626-
fn test_lists() {
671+
fn test_interleave_lists<O: OffsetSizeTrait>() {
627672
// [[1, 2], null, [3]]
628-
let mut a = ListBuilder::new(Int32Builder::new());
673+
let mut a = GenericListBuilder::<O, _>::new(Int32Builder::new());
629674
a.values().append_value(1);
630675
a.values().append_value(2);
631676
a.append(true);
@@ -635,7 +680,7 @@ mod tests {
635680
let a = a.finish();
636681

637682
// [[4], null, [5, 6, null]]
638-
let mut b = ListBuilder::new(Int32Builder::new());
683+
let mut b = GenericListBuilder::<O, _>::new(Int32Builder::new());
639684
b.values().append_value(4);
640685
b.append(true);
641686
b.append(false);
@@ -646,10 +691,13 @@ mod tests {
646691
let b = b.finish();
647692

648693
let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap();
649-
let v = values.as_any().downcast_ref::<ListArray>().unwrap();
694+
let v = values
695+
.as_any()
696+
.downcast_ref::<GenericListArray<O>>()
697+
.unwrap();
650698

651699
// [[3], null, [4], [5, 6, null], null]
652-
let mut expected = ListBuilder::new(Int32Builder::new());
700+
let mut expected = GenericListBuilder::<O, _>::new(Int32Builder::new());
653701
expected.values().append_value(3);
654702
expected.append(true);
655703
expected.append(false);
@@ -665,6 +713,16 @@ mod tests {
665713
assert_eq!(v, &expected);
666714
}
667715

716+
#[test]
717+
fn test_lists() {
718+
test_interleave_lists::<i32>();
719+
}
720+
721+
#[test]
722+
fn test_large_lists() {
723+
test_interleave_lists::<i64>();
724+
}
725+
668726
#[test]
669727
fn test_struct_without_nulls() {
670728
let fields = Fields::from(vec![

0 commit comments

Comments
 (0)