Skip to content

Commit 95d54ac

Browse files
committed
Improve take on List arrays
Signed-off-by: Adam Gutglick <adam@spiraldb.com>
1 parent aac969d commit 95d54ac

File tree

1 file changed

+129
-121
lines changed

1 file changed

+129
-121
lines changed

arrow-select/src/take.rs

Lines changed: 129 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ use arrow_buffer::{
2929
ArrowNativeType, BooleanBuffer, Buffer, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer,
3030
bit_util,
3131
};
32-
use arrow_data::ArrayDataBuilder;
32+
use arrow_data::{ArrayDataBuilder, transform::MutableArrayData};
3333
use arrow_schema::{ArrowError, DataType, FieldRef, UnionMode};
3434

35-
use num_traits::{One, Zero};
35+
use num_traits::Zero;
3636

3737
/// Take elements by index from [Array], creating a new [Array] from those indexes.
3838
///
@@ -611,9 +611,8 @@ fn take_byte_view<T: ByteViewType, IndexType: ArrowPrimitiveType>(
611611

612612
/// `take` implementation for list arrays
613613
///
614-
/// Calculates the index and indexed offset for the inner array,
615-
/// applying `take` on the inner array, then reconstructing a list array
616-
/// with the indexed offsets
614+
/// Copies the selected list entries' child slices into a new child array
615+
/// via `MutableArrayData`, then reconstructs a list array with new offsets
617616
fn take_list<IndexType, OffsetType>(
618617
values: &GenericListArray<OffsetType::Native>,
619618
indices: &PrimitiveArray<IndexType>,
@@ -624,23 +623,65 @@ where
624623
OffsetType::Native: OffsetSizeTrait,
625624
PrimitiveArray<OffsetType>: From<Vec<OffsetType::Native>>,
626625
{
627-
// TODO: Some optimizations can be done here such as if it is
628-
// taking the whole list or a contiguous sublist
629-
let (list_indices, offsets, null_buf) =
630-
take_value_indices_from_list::<IndexType, OffsetType>(values, indices)?;
631-
632-
let taken = take_impl::<OffsetType>(values.values().as_ref(), &list_indices)?;
633-
let value_offsets = Buffer::from_vec(offsets);
634-
// create a new list with taken data and computed null information
626+
let list_offsets = values.value_offsets();
627+
let child_data = values.values().to_data();
628+
let nulls = take_nulls(values.nulls(), indices);
629+
630+
let mut new_offsets = Vec::with_capacity(indices.len() + 1);
631+
new_offsets.push(OffsetType::Native::zero());
632+
633+
let use_nulls = child_data.null_count() > 0;
634+
635+
let capacity = child_data
636+
.len()
637+
.checked_div(values.len())
638+
.map(|v| v * indices.len())
639+
.unwrap_or_default();
640+
641+
let mut array_data = MutableArrayData::new(vec![&child_data], use_nulls, capacity);
642+
643+
match nulls.as_ref().filter(|n| n.null_count() > 0) {
644+
None => {
645+
for index in indices.values() {
646+
let ix = index.as_usize();
647+
let start = list_offsets[ix].as_usize();
648+
let end = list_offsets[ix + 1].as_usize();
649+
array_data.extend(0, start, end);
650+
new_offsets.push(OffsetType::Native::from_usize(array_data.len()).unwrap());
651+
}
652+
}
653+
Some(output_nulls) => {
654+
new_offsets.resize(indices.len() + 1, OffsetType::Native::zero());
655+
let mut last_filled = 0;
656+
for i in output_nulls.valid_indices() {
657+
let current = OffsetType::Native::from_usize(array_data.len()).unwrap();
658+
if last_filled < i {
659+
new_offsets[last_filled + 1..=i].fill(current);
660+
}
661+
// SAFETY: `i` comes from validity bitmap over `indices`, so in-bounds.
662+
let ix = unsafe { indices.value_unchecked(i) }.as_usize();
663+
let start = list_offsets[ix].as_usize();
664+
let end = list_offsets[ix + 1].as_usize();
665+
array_data.extend(0, start, end);
666+
new_offsets[i + 1] = OffsetType::Native::from_usize(array_data.len()).unwrap();
667+
last_filled = i + 1;
668+
}
669+
let final_offset = OffsetType::Native::from_usize(array_data.len()).unwrap();
670+
new_offsets[last_filled + 1..].fill(final_offset);
671+
}
672+
};
673+
674+
let child_data = array_data.freeze();
675+
let value_offsets = Buffer::from_vec(new_offsets);
676+
635677
let list_data = ArrayDataBuilder::new(values.data_type().clone())
636678
.len(indices.len())
637-
.null_bit_buffer(Some(null_buf.into()))
679+
.nulls(nulls)
638680
.offset(0)
639-
.add_child_data(taken.into_data())
681+
.add_child_data(child_data)
640682
.add_buffer(value_offsets);
641683

642684
let list_data = unsafe { list_data.build_unchecked() };
643-
644685
Ok(GenericListArray::<OffsetType::Native>::from(list_data))
645686
}
646687

@@ -925,78 +966,6 @@ fn take_run<T: RunEndIndexType, I: ArrowPrimitiveType>(
925966
Ok(array_data.into())
926967
}
927968

928-
/// Takes/filters a list array's inner data using the offsets of the list array.
929-
///
930-
/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns
931-
/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2
932-
/// elements)
933-
#[allow(clippy::type_complexity)]
934-
fn take_value_indices_from_list<IndexType, OffsetType>(
935-
list: &GenericListArray<OffsetType::Native>,
936-
indices: &PrimitiveArray<IndexType>,
937-
) -> Result<
938-
(
939-
PrimitiveArray<OffsetType>,
940-
Vec<OffsetType::Native>,
941-
MutableBuffer,
942-
),
943-
ArrowError,
944-
>
945-
where
946-
IndexType: ArrowPrimitiveType,
947-
OffsetType: ArrowPrimitiveType,
948-
OffsetType::Native: OffsetSizeTrait + std::ops::Add + Zero + One,
949-
PrimitiveArray<OffsetType>: From<Vec<OffsetType::Native>>,
950-
{
951-
// TODO: benchmark this function, there might be a faster unsafe alternative
952-
let offsets: &[OffsetType::Native] = list.value_offsets();
953-
954-
let mut new_offsets = Vec::with_capacity(indices.len());
955-
let mut values = Vec::new();
956-
let mut current_offset = OffsetType::Native::zero();
957-
// add first offset
958-
new_offsets.push(OffsetType::Native::zero());
959-
960-
// Initialize null buffer
961-
let num_bytes = bit_util::ceil(indices.len(), 8);
962-
let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
963-
let null_slice = null_buf.as_slice_mut();
964-
965-
// compute the value indices, and set offsets accordingly
966-
for i in 0..indices.len() {
967-
if indices.is_valid(i) {
968-
let ix = indices
969-
.value(i)
970-
.to_usize()
971-
.ok_or_else(|| ArrowError::ComputeError("Cast to usize failed".to_string()))?;
972-
let start = offsets[ix];
973-
let end = offsets[ix + 1];
974-
current_offset += end - start;
975-
new_offsets.push(current_offset);
976-
977-
let mut curr = start;
978-
979-
// if start == end, this slot is empty
980-
while curr < end {
981-
values.push(curr);
982-
curr += One::one();
983-
}
984-
if !list.is_valid(ix) {
985-
bit_util::unset_bit(null_slice, i);
986-
}
987-
} else {
988-
bit_util::unset_bit(null_slice, i);
989-
new_offsets.push(current_offset);
990-
}
991-
}
992-
993-
Ok((
994-
PrimitiveArray::<OffsetType>::from(values),
995-
new_offsets,
996-
null_buf,
997-
))
998-
}
999-
1000969
/// Takes/filters a fixed size list array's inner data using the offsets of the list array.
1001970
fn take_value_indices_from_fixed_size_list<IndexType>(
1002971
list: &FixedSizeListArray,
@@ -1985,6 +1954,78 @@ mod tests {
19851954
}};
19861955
}
19871956

1957+
fn test_take_sliced_list_generic<S: OffsetSizeTrait + 'static>() {
1958+
let list = build_generic_list::<S, Int32Type>(vec![
1959+
Some(vec![0, 1]),
1960+
Some(vec![2, 3, 4]),
1961+
None,
1962+
Some(vec![]),
1963+
Some(vec![5, 6]),
1964+
Some(vec![7]),
1965+
]);
1966+
let sliced = list.slice(1, 4);
1967+
let indices = UInt32Array::from(vec![Some(3), Some(0), None, Some(2), Some(1)]);
1968+
1969+
let taken = take(&sliced, &indices, None).unwrap();
1970+
let taken = taken.as_list::<S>();
1971+
1972+
let expected = build_generic_list::<S, Int32Type>(vec![
1973+
Some(vec![5, 6]),
1974+
Some(vec![2, 3, 4]),
1975+
None,
1976+
Some(vec![]),
1977+
None,
1978+
]);
1979+
1980+
assert_eq!(taken, &expected);
1981+
}
1982+
1983+
fn test_take_sliced_list_with_value_nulls_generic<S: OffsetSizeTrait + 'static>() {
1984+
let list = GenericListArray::<S>::from_iter_primitive::<Int32Type, _, _>(vec![
1985+
Some(vec![Some(10)]),
1986+
Some(vec![None, Some(1)]),
1987+
None,
1988+
Some(vec![Some(2), None]),
1989+
Some(vec![]),
1990+
Some(vec![Some(3)]),
1991+
]);
1992+
let sliced = list.slice(1, 4);
1993+
let indices = UInt32Array::from(vec![Some(2), Some(0), None, Some(3), Some(1)]);
1994+
1995+
let taken = take(&sliced, &indices, None).unwrap();
1996+
let taken = taken.as_list::<S>();
1997+
1998+
let expected = GenericListArray::<S>::from_iter_primitive::<Int32Type, _, _>(vec![
1999+
Some(vec![Some(2), None]),
2000+
Some(vec![None, Some(1)]),
2001+
None,
2002+
Some(vec![]),
2003+
None,
2004+
]);
2005+
2006+
assert_eq!(taken, &expected);
2007+
}
2008+
2009+
#[test]
2010+
fn test_take_sliced_list() {
2011+
test_take_sliced_list_generic::<i32>();
2012+
}
2013+
2014+
#[test]
2015+
fn test_take_sliced_large_list() {
2016+
test_take_sliced_list_generic::<i64>();
2017+
}
2018+
2019+
#[test]
2020+
fn test_take_sliced_list_with_value_nulls() {
2021+
test_take_sliced_list_with_value_nulls_generic::<i32>();
2022+
}
2023+
2024+
#[test]
2025+
fn test_take_sliced_large_list_with_value_nulls() {
2026+
test_take_sliced_list_with_value_nulls_generic::<i64>();
2027+
}
2028+
19882029
fn test_take_list_view_generic<OffsetType: OffsetSizeTrait, ValuesType: ArrowPrimitiveType, F>(
19892030
values: Vec<Option<Vec<Option<ValuesType::Native>>>>,
19902031
take_indices: Vec<Option<usize>>,
@@ -2497,39 +2538,6 @@ mod tests {
24972538
)
24982539
}
24992540

2500-
#[test]
2501-
fn test_take_value_index_from_list() {
2502-
let list = build_generic_list::<i32, Int32Type>(vec![
2503-
Some(vec![0, 1]),
2504-
Some(vec![2, 3, 4]),
2505-
Some(vec![5, 6, 7, 8, 9]),
2506-
]);
2507-
let indices = UInt32Array::from(vec![2, 0]);
2508-
2509-
let (indexed, offsets, null_buf) = take_value_indices_from_list(&list, &indices).unwrap();
2510-
2511-
assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
2512-
assert_eq!(offsets, vec![0, 5, 7]);
2513-
assert_eq!(null_buf.as_slice(), &[0b11111111]);
2514-
}
2515-
2516-
#[test]
2517-
fn test_take_value_index_from_large_list() {
2518-
let list = build_generic_list::<i64, Int32Type>(vec![
2519-
Some(vec![0, 1]),
2520-
Some(vec![2, 3, 4]),
2521-
Some(vec![5, 6, 7, 8, 9]),
2522-
]);
2523-
let indices = UInt32Array::from(vec![2, 0]);
2524-
2525-
let (indexed, offsets, null_buf) =
2526-
take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap();
2527-
2528-
assert_eq!(indexed, Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
2529-
assert_eq!(offsets, vec![0, 5, 7]);
2530-
assert_eq!(null_buf.as_slice(), &[0b11111111]);
2531-
}
2532-
25332541
#[test]
25342542
fn test_take_runs() {
25352543
let logical_array: Vec<i32> = vec![1_i32, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2];

0 commit comments

Comments
 (0)