Skip to content

Commit c66bad6

Browse files
committed
Improve take on List arrays
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent aac969d commit c66bad6

File tree

1 file changed

+127
-121
lines changed

1 file changed

+127
-121
lines changed

arrow-select/src/take.rs

Lines changed: 127 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ use arrow_buffer::{
2929
ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer,
3030
bit_util,
3131
};
32-
use arrow_data::ArrayDataBuilder;
32+
use arrow_data::{ArrayDataBuilder, transform::MutableArrayData};
3333
use arrow_schema::{ArrowError, DataType, FieldRef, UnionMode};
3434

35-
use num_traits::{One, Zero};
35+
use num_traits::Zero;
3636

3737
/// Take elements by index from [Array], creating a new [Array] from those indexes.
3838
///
@@ -611,9 +611,8 @@ fn take_byte_view<T: ByteViewType, IndexType: ArrowPrimitiveType>(
611611

612612
/// `take` implementation for list arrays
613613
///
614-
/// Calculates the index and indexed offset for the inner array,
615-
/// applying `take` on the inner array, then reconstructing a list array
616-
/// with the indexed offsets
614+
/// Copies the selected list entries' child slices into a new child array
615+
/// via `MutableArrayData`, then reconstructs a list array with new offsets
617616
fn take_list<IndexType, OffsetType>(
618617
values: &GenericListArray<OffsetType::Native>,
619618
indices: &PrimitiveArray<IndexType>,
@@ -624,23 +623,63 @@ where
624623
OffsetType::Native: OffsetSizeTrait,
625624
PrimitiveArray<OffsetType>: From<Vec<OffsetType::Native>>,
626625
{
627-
// TODO: Some optimizations can be done here such as if it is
628-
// taking the whole list or a contiguous sublist
629-
let (list_indices, offsets, null_buf) =
630-
take_value_indices_from_list::<IndexType, OffsetType>(values, indices)?;
631-
632-
let taken = take_impl::<OffsetType>(values.values().as_ref(), &list_indices)?;
633-
let value_offsets = Buffer::from_vec(offsets);
634-
// create a new list with taken data and computed null information
626+
let list_offsets = values.value_offsets();
627+
let child_data = values.values().to_data();
628+
let nulls = take_nulls(values.nulls(), indices);
629+
630+
let mut new_offsets = Vec::with_capacity(indices.len() + 1);
631+
new_offsets.push(OffsetType::Native::zero());
632+
633+
let use_nulls = child_data.null_count() > 0;
634+
let capacity = if values.len() > 0 {
635+
(child_data.len() / values.len()) * indices.len()
636+
} else {
637+
0
638+
};
639+
let mut array_data = MutableArrayData::new(vec![&child_data], use_nulls, capacity);
640+
641+
match nulls.as_ref().filter(|n| n.null_count() > 0) {
642+
None => {
643+
for index in indices.values() {
644+
let ix = index.as_usize();
645+
let start = list_offsets[ix].as_usize();
646+
let end = list_offsets[ix + 1].as_usize();
647+
array_data.extend(0, start, end);
648+
new_offsets.push(OffsetType::Native::from_usize(array_data.len()).unwrap());
649+
}
650+
}
651+
Some(output_nulls) => {
652+
new_offsets.resize(indices.len() + 1, OffsetType::Native::zero());
653+
let mut last_filled = 0;
654+
for i in output_nulls.valid_indices() {
655+
let current = OffsetType::Native::from_usize(array_data.len()).unwrap();
656+
if last_filled < i {
657+
new_offsets[last_filled + 1..=i].fill(current);
658+
}
659+
// SAFETY: `i` comes from validity bitmap over `indices`, so in-bounds.
660+
let ix = unsafe { indices.value_unchecked(i) }.as_usize();
661+
let start = list_offsets[ix].as_usize();
662+
let end = list_offsets[ix + 1].as_usize();
663+
array_data.extend(0, start, end);
664+
new_offsets[i + 1] = OffsetType::Native::from_usize(array_data.len()).unwrap();
665+
last_filled = i + 1;
666+
}
667+
let final_offset = OffsetType::Native::from_usize(array_data.len()).unwrap();
668+
new_offsets[last_filled + 1..].fill(final_offset);
669+
}
670+
};
671+
672+
let child_data = array_data.freeze();
673+
let value_offsets = Buffer::from_vec(new_offsets);
674+
635675
let list_data = ArrayDataBuilder::new(values.data_type().clone())
636676
.len(indices.len())
637-
.null_bit_buffer(Some(null_buf.into()))
677+
.nulls(nulls)
638678
.offset(0)
639-
.add_child_data(taken.into_data())
679+
.add_child_data(child_data)
640680
.add_buffer(value_offsets);
641681

642682
let list_data = unsafe { list_data.build_unchecked() };
643-
644683
Ok(GenericListArray::<OffsetType::Native>::from(list_data))
645684
}
646685

@@ -925,78 +964,6 @@ fn take_run<T: RunEndIndexType, I: ArrowPrimitiveType>(
925964
Ok(array_data.into())
926965
}
927966

928-
/// Takes/filters a list array's inner data using the offsets of the list array.
929-
///
930-
/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns
931-
/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2
932-
/// elements)
933-
#[allow(clippy::type_complexity)]
934-
fn take_value_indices_from_list<IndexType, OffsetType>(
935-
list: &GenericListArray<OffsetType::Native>,
936-
indices: &PrimitiveArray<IndexType>,
937-
) -> Result<
938-
(
939-
PrimitiveArray<OffsetType>,
940-
Vec<OffsetType::Native>,
941-
MutableBuffer,
942-
),
943-
ArrowError,
944-
>
945-
where
946-
IndexType: ArrowPrimitiveType,
947-
OffsetType: ArrowPrimitiveType,
948-
OffsetType::Native: OffsetSizeTrait + std::ops::Add + Zero + One,
949-
PrimitiveArray<OffsetType>: From<Vec<OffsetType::Native>>,
950-
{
951-
// TODO: benchmark this function, there might be a faster unsafe alternative
952-
let offsets: &[OffsetType::Native] = list.value_offsets();
953-
954-
let mut new_offsets = Vec::with_capacity(indices.len());
955-
let mut values = Vec::new();
956-
let mut current_offset = OffsetType::Native::zero();
957-
// add first offset
958-
new_offsets.push(OffsetType::Native::zero());
959-
960-
// Initialize null buffer
961-
let num_bytes = bit_util::ceil(indices.len(), 8);
962-
let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
963-
let null_slice = null_buf.as_slice_mut();
964-
965-
// compute the value indices, and set offsets accordingly
966-
for i in 0..indices.len() {
967-
if indices.is_valid(i) {
968-
let ix = indices
969-
.value(i)
970-
.to_usize()
971-
.ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?;
972-
let start = offsets[ix];
973-
let end = offsets[ix + 1];
974-
current_offset += end - start;
975-
new_offsets.push(current_offset);
976-
977-
let mut curr = start;
978-
979-
// if start == end, this slot is empty
980-
while curr < end {
981-
values.push(curr);
982-
curr += One::one();
983-
}
984-
if !list.is_valid(ix) {
985-
bit_util::unset_bit(null_slice, i);
986-
}
987-
} else {
988-
bit_util::unset_bit(null_slice, i);
989-
new_offsets.push(current_offset);
990-
}
991-
}
992-
993-
Ok((
994-
PrimitiveArray::<OffsetType>::from(values),
995-
new_offsets,
996-
null_buf,
997-
))
998-
}
999-
1000967
/// Takes/filters a fixed size list array's inner data using the offsets of the list array.
1001968
fn take_value_indices_from_fixed_size_list<IndexType>(
1002969
list: &FixedSizeListArray,
@@ -1985,6 +1952,78 @@ mod tests {
19851952
}};
19861953
}
19871954

1955+
fn test_take_sliced_list_generic<S: OffsetSizeTrait + 'static>() {
1956+
let list = build_generic_list::<S, Int32Type>(vec![
1957+
Some(vec![0, 1]),
1958+
Some(vec![2, 3, 4]),
1959+
None,
1960+
Some(vec![]),
1961+
Some(vec![5, 6]),
1962+
Some(vec![7]),
1963+
]);
1964+
let sliced = list.slice(1, 4);
1965+
let indices = UInt32Array::from(vec![Some(3), Some(0), None, Some(2), Some(1)]);
1966+
1967+
let taken = take(&sliced, &indices, None).unwrap();
1968+
let taken = taken.as_list::<S>();
1969+
1970+
let expected = build_generic_list::<S, Int32Type>(vec![
1971+
Some(vec![5, 6]),
1972+
Some(vec![2, 3, 4]),
1973+
None,
1974+
Some(vec![]),
1975+
None,
1976+
]);
1977+
1978+
assert_eq!(taken, &expected);
1979+
}
1980+
1981+
fn test_take_sliced_list_with_value_nulls_generic<S: OffsetSizeTrait + 'static>() {
1982+
let list = GenericListArray::<S>::from_iter_primitive::<Int32Type, _, _>(vec![
1983+
Some(vec![Some(10)]),
1984+
Some(vec![None, Some(1)]),
1985+
None,
1986+
Some(vec![Some(2), None]),
1987+
Some(vec![]),
1988+
Some(vec![Some(3)]),
1989+
]);
1990+
let sliced = list.slice(1, 4);
1991+
let indices = UInt32Array::from(vec![Some(2), Some(0), None, Some(3), Some(1)]);
1992+
1993+
let taken = take(&sliced, &indices, None).unwrap();
1994+
let taken = taken.as_list::<S>();
1995+
1996+
let expected = GenericListArray::<S>::from_iter_primitive::<Int32Type, _, _>(vec![
1997+
Some(vec![Some(2), None]),
1998+
Some(vec![None, Some(1)]),
1999+
None,
2000+
Some(vec![]),
2001+
None,
2002+
]);
2003+
2004+
assert_eq!(taken, &expected);
2005+
}
2006+
2007+
#[test]
2008+
fn test_take_sliced_list() {
2009+
test_take_sliced_list_generic::<i32>();
2010+
}
2011+
2012+
#[test]
2013+
fn test_take_sliced_large_list() {
2014+
test_take_sliced_list_generic::<i64>();
2015+
}
2016+
2017+
#[test]
2018+
fn test_take_sliced_list_with_value_nulls() {
2019+
test_take_sliced_list_with_value_nulls_generic::<i32>();
2020+
}
2021+
2022+
#[test]
2023+
fn test_take_sliced_large_list_with_value_nulls() {
2024+
test_take_sliced_list_with_value_nulls_generic::<i64>();
2025+
}
2026+
19882027
fn test_take_list_view_generic<OffsetType: OffsetSizeTrait, ValuesType: ArrowPrimitiveType, F>(
19892028
values: Vec<Option<Vec<Option<ValuesType::Native>>>>,
19902029
take_indices: Vec<Option<usize>>,
@@ -2497,39 +2536,6 @@ mod tests {
24972536
)
24982537
}
24992538

2500-
#[test]
2501-
fn test_take_value_index_from_list() {
2502-
let list = build_generic_list::<i32, Int32Type>(vec![
2503-
Some(vec![0, 1]),
2504-
Some(vec![2, 3, 4]),
2505-
Some(vec![5, 6, 7, 8, 9]),
2506-
]);
2507-
let indices = UInt32Array::from(vec![2, 0]);
2508-
2509-
let (indexed, offsets, null_buf) = take_value_indices_from_list(&list, &indices).unwrap();
2510-
2511-
assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
2512-
assert_eq!(offsets, vec![0, 5, 7]);
2513-
assert_eq!(null_buf.as_slice(), &[0b11111111]);
2514-
}
2515-
2516-
#[test]
2517-
fn test_take_value_index_from_large_list() {
2518-
let list = build_generic_list::<i64, Int32Type>(vec![
2519-
Some(vec![0, 1]),
2520-
Some(vec![2, 3, 4]),
2521-
Some(vec![5, 6, 7, 8, 9]),
2522-
]);
2523-
let indices = UInt32Array::from(vec![2, 0]);
2524-
2525-
let (indexed, offsets, null_buf) =
2526-
take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap();
2527-
2528-
assert_eq!(indexed, Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
2529-
assert_eq!(offsets, vec![0, 5, 7]);
2530-
assert_eq!(null_buf.as_slice(), &[0b11111111]);
2531-
}
2532-
25332539
#[test]
25342540
fn test_take_runs() {
25352541
let logical_array: Vec<i32> = vec![1_i32, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2];

0 commit comments

Comments
 (0)