Skip to content

Commit a291eec

Browse files
committed
arrow-select: implement specialized interleave_list
Previously, List and LargeList would fall through to the interleave_fallback match arm, which is inefficient. This commit implements interleave_list, which interleaves a list's child arrays and rebuilds the offsets buffer. Running it on production tests reduced memory by 80%. Signed-off-by: Alfonso Subiotto Marques <alfonso.subiotto@polarsignals.com>
1 parent bab30ae commit a291eec

File tree

1 file changed

+61
-8
lines changed

1 file changed

+61
-8
lines changed

arrow-select/src/interleave.rs

Lines changed: 61 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow_array::*;
2525
use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, NullBuffer, OffsetBuffer};
2626
use arrow_data::ByteView;
2727
use arrow_data::transform::MutableArrayData;
28-
use arrow_schema::{ArrowError, DataType, Fields};
28+
use arrow_schema::{ArrowError, DataType, FieldRef, Fields};
2929
use std::sync::Arc;
3030

3131
macro_rules! primitive_helper {
@@ -105,6 +105,8 @@ pub fn interleave(
105105
_ => unreachable!("illegal dictionary key type {k}")
106106
},
107107
DataType::Struct(fields) => interleave_struct(fields, values, indices),
108+
DataType::List(field) => interleave_list::<i32>(values, indices, field),
109+
DataType::LargeList(field) => interleave_list::<i64>(values, indices, field),
108110
_ => interleave_fallback(values, indices)
109111
}
110112
}
@@ -312,6 +314,45 @@ fn interleave_struct(
312314
Ok(Arc::new(struct_array))
313315
}
314316

317+
fn interleave_list<O: OffsetSizeTrait>(
318+
values: &[&dyn Array],
319+
indices: &[(usize, usize)],
320+
field: &FieldRef,
321+
) -> Result<ArrayRef, ArrowError> {
322+
let interleaved = Interleave::<'_, GenericListArray<O>>::new(values, indices);
323+
324+
let mut child_indices = Vec::new();
325+
let mut offsets = Vec::with_capacity(indices.len() + 1);
326+
offsets.push(O::from_usize(0).unwrap());
327+
328+
for (array, row) in indices {
329+
let list = interleaved.arrays[*array];
330+
let start = list.value_offsets()[*row].as_usize();
331+
let end = list.value_offsets()[*row + 1].as_usize();
332+
333+
child_indices.extend((start..end).map(|i| (*array, i)));
334+
offsets.push(O::from_usize(child_indices.len()).expect("offset overflow"));
335+
}
336+
337+
let child_arrays: Vec<&dyn Array> = interleaved
338+
.arrays
339+
.iter()
340+
.map(|list| list.values().as_ref())
341+
.collect();
342+
343+
let interleaved_values = interleave(&child_arrays, &child_indices)?;
344+
345+
let offsets = OffsetBuffer::new(offsets.into());
346+
let list_array = GenericListArray::<O>::new(
347+
field.clone(),
348+
offsets,
349+
interleaved_values,
350+
interleaved.nulls,
351+
);
352+
353+
Ok(Arc::new(list_array))
354+
}
355+
315356
/// Fallback implementation of interleave using [`MutableArrayData`]
316357
fn interleave_fallback(
317358
values: &[&dyn Array],
@@ -411,7 +452,7 @@ pub fn interleave_record_batch(
411452
mod tests {
412453
use super::*;
413454
use arrow_array::Int32RunArray;
414-
use arrow_array::builder::{Int32Builder, ListBuilder, PrimitiveRunBuilder};
455+
use arrow_array::builder::{GenericListBuilder, Int32Builder, PrimitiveRunBuilder};
415456
use arrow_schema::Field;
416457

417458
#[test]
@@ -509,10 +550,9 @@ mod tests {
509550
assert_eq!(actual, expected);
510551
}
511552

512-
#[test]
513-
fn test_lists() {
553+
fn test_interleave_lists<O: OffsetSizeTrait>() {
514554
// [[1, 2], null, [3]]
515-
let mut a = ListBuilder::new(Int32Builder::new());
555+
let mut a = GenericListBuilder::<O, _>::new(Int32Builder::new());
516556
a.values().append_value(1);
517557
a.values().append_value(2);
518558
a.append(true);
@@ -522,7 +562,7 @@ mod tests {
522562
let a = a.finish();
523563

524564
// [[4], null, [5, 6, null]]
525-
let mut b = ListBuilder::new(Int32Builder::new());
565+
let mut b = GenericListBuilder::<O, _>::new(Int32Builder::new());
526566
b.values().append_value(4);
527567
b.append(true);
528568
b.append(false);
@@ -533,10 +573,13 @@ mod tests {
533573
let b = b.finish();
534574

535575
let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), (1, 1)]).unwrap();
536-
let v = values.as_any().downcast_ref::<ListArray>().unwrap();
576+
let v = values
577+
.as_any()
578+
.downcast_ref::<GenericListArray<O>>()
579+
.unwrap();
537580

538581
// [[3], null, [4], [5, 6, null], null]
539-
let mut expected = ListBuilder::new(Int32Builder::new());
582+
let mut expected = GenericListBuilder::<O, _>::new(Int32Builder::new());
540583
expected.values().append_value(3);
541584
expected.append(true);
542585
expected.append(false);
@@ -552,6 +595,16 @@ mod tests {
552595
assert_eq!(v, &expected);
553596
}
554597

598+
#[test]
599+
fn test_lists() {
600+
test_interleave_lists::<i32>();
601+
}
602+
603+
#[test]
604+
fn test_large_lists() {
605+
test_interleave_lists::<i64>();
606+
}
607+
555608
#[test]
556609
fn test_struct_without_nulls() {
557610
let fields = Fields::from(vec![

0 commit comments

Comments
 (0)