Skip to content

Commit c69957f

Browse files
fix: avoid ListView take_reduce rebuild for dense selections (#7339)
The `take_reduce` path rebuilds the ListView, which is expensive. For dense selections (density above the rebuild threshold) there's little garbage to reclaim, so skip the rebuild and let the regular take path handle it. The rebuild is still worthwhile for sparse takes where we'd otherwise drag around a lot of unused elements when exporting (e.g. to DuckDB). This PR does a very crude estimation of the number of elements that stay in the ListViewArray. Potentially we should do this estimation at export time in the `ListViewExporter`, but that's less trivial. --------- Signed-off-by: Dimitar Dimitrov <mitko@spiraldb.com> Signed-off-by: Dimitar Dimitrov <dimitar@spiraldb.com> Co-authored-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent aca6251 commit c69957f

File tree

5 files changed

+113
-85
lines changed

5 files changed

+113
-85
lines changed

vortex-array/public-api.lock

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2232,6 +2232,10 @@ impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::List
22322232

22332233
pub fn vortex_array::arrays::List::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::List>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
22342234

2235+
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView
2236+
2237+
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
2238+
22352239
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::Primitive
22362240

22372241
pub fn vortex_array::arrays::Primitive::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::Primitive>, indices: &vortex_array::ArrayRef, ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
@@ -2970,6 +2974,10 @@ impl vortex_array::ValidityVTable<vortex_array::arrays::ListView> for vortex_arr
29702974

29712975
pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult<vortex_array::validity::Validity>
29722976

2977+
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView
2978+
2979+
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
2980+
29732981
impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView
29742982

29752983
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
@@ -5884,6 +5892,10 @@ impl vortex_array::ValidityVTable<vortex_array::arrays::ListView> for vortex_arr
58845892

58855893
pub fn vortex_array::arrays::ListView::validity(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>) -> vortex_error::VortexResult<vortex_array::validity::Validity>
58865894

5895+
impl vortex_array::arrays::dict::TakeExecute for vortex_array::arrays::ListView
5896+
5897+
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef, _ctx: &mut vortex_array::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>
5898+
58875899
impl vortex_array::arrays::dict::TakeReduce for vortex_array::arrays::ListView
58885900

58895901
pub fn vortex_array::arrays::ListView::take(array: vortex_array::ArrayView<'_, vortex_array::arrays::ListView>, indices: &vortex_array::ArrayRef) -> vortex_error::VortexResult<core::option::Option<vortex_array::ArrayRef>>

vortex-array/src/arrays/dict/execute.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ pub fn take_canonical(
4646
Canonical::Primitive(a) => Canonical::Primitive(take_primitive(&a, codes, ctx)),
4747
Canonical::Decimal(a) => Canonical::Decimal(take_decimal(&a, codes, ctx)),
4848
Canonical::VarBinView(a) => Canonical::VarBinView(take_varbinview(&a, codes, ctx)),
49-
Canonical::List(a) => Canonical::List(take_listview(&a, codes)),
49+
Canonical::List(a) => Canonical::List(take_listview(&a, codes, ctx)),
5050
Canonical::FixedSizeList(a) => {
5151
Canonical::FixedSizeList(take_fixed_size_list(&a, codes, ctx))
5252
}
@@ -123,12 +123,16 @@ fn take_varbinview(
123123
.into_owned()
124124
}
125125

126-
fn take_listview(array: &ListViewArray, codes: &PrimitiveArray) -> ListViewArray {
126+
fn take_listview(
127+
array: &ListViewArray,
128+
codes: &PrimitiveArray,
129+
ctx: &mut ExecutionCtx,
130+
) -> ListViewArray {
127131
let codes_ref = codes.clone().into_array();
128132
let array = array.as_view();
129-
<ListView as TakeReduce>::take(array, &codes_ref)
130-
.vortex_expect("take listview array")
131-
.vortex_expect("take listview should not return None")
133+
<ListView as TakeExecute>::take(array, &codes_ref, ctx)
134+
.vortex_expect("take listview execute")
135+
.vortex_expect("ListView TakeExecute should not return None")
132136
.as_::<ListView>()
133137
.into_owned()
134138
}

vortex-array/src/arrays/filter/execute/listview.rs

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,10 @@ use vortex_mask::MaskValues;
99
use crate::arrays::ListViewArray;
1010
use crate::arrays::filter::execute::filter_validity;
1111
use crate::arrays::filter::execute::values_to_mask;
12+
use crate::arrays::listview;
1213
use crate::arrays::listview::ListViewArrayExt;
1314
use crate::arrays::listview::ListViewRebuildMode;
1415

15-
// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators.
16-
/// The threshold for triggering a rebuild of the [`ListViewArray`].
17-
///
18-
/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it
19-
/// can be potentially expensive to reorganize the array based on what views we have into it.
20-
///
21-
/// However, we also do not want to carry around a large amount of garbage data. Below this
22-
/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing
23-
/// any garbage data.
24-
#[expect(unused)]
25-
const REBUILD_DENSITY_THRESHOLD: f64 = 0.1;
26-
2716
/// [`ListViewArray`] filter implementation.
2817
///
2918
/// This implementation is deliberately simple and read-optimized. We just filter the `offsets` and
@@ -70,13 +59,14 @@ pub fn filter_listview(array: &ListViewArray, selection_mask: &Arc<MaskValues>)
7059
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
7160
};
7261

73-
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
74-
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
75-
// right now, so we will just rebuild every time (similar to `ListArray`).
76-
77-
new_array
78-
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)
79-
.vortex_expect("ListViewArray rebuild to zero-copy List should always succeed")
62+
let kept_row_fraction = selection_mask.true_count() as f32 / array.sizes().len() as f32;
63+
if kept_row_fraction < listview::compute::REBUILD_DENSITY_THRESHOLD {
64+
new_array
65+
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)
66+
.vortex_expect("ListViewArray rebuild to zero-copy List should always succeed")
67+
} else {
68+
new_array
69+
}
8070
}
8171

8272
#[cfg(test)]

vortex-array/src/arrays/listview/compute/mod.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,15 @@ mod mask;
66
pub(crate) mod rules;
77
mod slice;
88
mod take;
9+
10+
/// The threshold below which we rebuild the elements of a listview.
11+
///
12+
/// We don't touch `elements` on the metadata-only path since reorganizing it can be expensive.
13+
/// However, we also don't want to drag around a large amount of garbage data when the selection
14+
/// is sparse. Below this fraction of list rows retained, the rebuild is worth it.
15+
/// Rebuilding is needed when exporting the ListView's elements.
16+
///
17+
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
18+
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
19+
// right now, so we will just rebuild every time (similar to [`ListArray`]).
20+
pub(crate) const REBUILD_DENSITY_THRESHOLD: f32 = 0.1;

vortex-array/src/arrays/listview/compute/take.rs

Lines changed: 71 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,14 @@
44
use num_traits::Zero;
55
use vortex_error::VortexResult;
66

7+
use super::REBUILD_DENSITY_THRESHOLD;
78
use crate::ArrayRef;
9+
use crate::ExecutionCtx;
810
use crate::IntoArray;
911
use crate::array::ArrayView;
1012
use crate::arrays::ListView;
1113
use crate::arrays::ListViewArray;
14+
use crate::arrays::dict::TakeExecute;
1215
use crate::arrays::dict::TakeReduce;
1316
use crate::arrays::listview::ListViewArrayExt;
1417
use crate::arrays::listview::ListViewRebuildMode;
@@ -17,72 +20,79 @@ use crate::dtype::Nullability;
1720
use crate::match_each_integer_ptype;
1821
use crate::scalar::Scalar;
1922

20-
// TODO(connor)[ListView]: Make use of this threshold after we start migrating operators.
21-
/// The threshold for triggering a rebuild of the [`ListViewArray`].
22-
///
23-
/// By default, we will not touch the underlying `elements` array of the [`ListViewArray`] since it
24-
/// can be potentially expensive to reorganize the array based on what views we have into it.
25-
///
26-
/// However, we also do not want to carry around a large amount of garbage data. Below this
27-
/// threshold of the density of the selection mask, we will rebuild the [`ListViewArray`], removing
28-
/// any garbage data.
29-
#[expect(unused)]
30-
const REBUILD_DENSITY_THRESHOLD: f64 = 0.1;
31-
32-
/// [`ListViewArray`] take implementation.
33-
///
34-
/// This implementation is deliberately simple and read-optimized. We just take the `offsets` and
35-
/// `sizes` at the requested indices and reuse the original `elements` array. This works because
36-
/// `ListView` (unlike `List`) allows non-contiguous and out-of-order lists.
37-
///
38-
/// We don't slice the `elements` array because it would require computing min/max offsets and
39-
/// adjusting all offsets accordingly, which is not really worth the small potential memory we would
40-
/// be able to get back.
41-
///
42-
/// The trade-off is that we may keep unreferenced elements in memory, but this is acceptable since
43-
/// we're optimizing for read performance and the data isn't being copied.
23+
/// Metadata-only take for [`ListViewArray`].
4424
impl TakeReduce for ListView {
4525
fn take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<Option<ArrayRef>> {
46-
let elements = array.elements();
47-
let offsets = array.offsets();
48-
let sizes = array.sizes();
26+
// Approximate element density by the fraction of list rows retained. Assumes roughly
27+
// uniform list sizes; good enough to decide whether dragging along the full `elements`
28+
// buffer is worth avoiding a rebuild.
29+
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
30+
if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
31+
return Ok(None);
32+
}
4933

50-
// Compute the new validity by combining the array's validity with the indices' validity.
51-
let new_validity = array.validity()?.take(indices)?;
34+
Ok(Some(apply_take(array, indices)?.into_array()))
35+
}
36+
}
5237

53-
// Take the offsets and sizes arrays at the requested indices.
54-
// Take can reorder offsets, create gaps, and may introduce overlaps if the `indices`
55-
// contain duplicates.
56-
let nullable_new_offsets = offsets.take(indices.clone())?;
57-
let nullable_new_sizes = sizes.take(indices.clone())?;
38+
/// Execution-path take for [`ListViewArray`].
39+
///
40+
/// This does the same metadata-only take as [`TakeReduce`], but also rebuilds the array if the
41+
/// resulting array will be less dense than `REBUILD_DENSITY_THRESHOLD`.
42+
impl TakeExecute for ListView {
43+
fn take(
44+
array: ArrayView<'_, ListView>,
45+
indices: &ArrayRef,
46+
_ctx: &mut ExecutionCtx,
47+
) -> VortexResult<Option<ArrayRef>> {
48+
let kept_row_fraction = indices.len() as f32 / array.sizes().len() as f32;
49+
let taken = apply_take(array, indices)?;
5850

59-
// Since `take` returns nullable arrays, we simply cast it back to non-nullable (filled with
60-
// zeros to represent null lists).
61-
let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| {
62-
nullable_new_offsets
63-
.fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))?
64-
});
65-
let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| {
66-
nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))?
67-
});
68-
// SAFETY: Take operation maintains all `ListViewArray` invariants:
69-
// - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
70-
// - `new_offsets` and `new_sizes` are non-nullable.
71-
// - `new_offsets` and `new_sizes` have the same length (both taken with the same
72-
// `indices`).
73-
// - Validity correctly reflects the combination of array and indices validity.
74-
let new_array = unsafe {
75-
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
76-
};
51+
if kept_row_fraction < REBUILD_DENSITY_THRESHOLD {
52+
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
53+
// compute functions have run, at the "top" of the operator tree. However, we cannot do
54+
// this right now, so we will just rebuild every time (similar to `ListArray`).
55+
Ok(Some(
56+
taken
57+
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
58+
.into_array(),
59+
))
60+
} else {
61+
Ok(Some(taken.into_array()))
62+
}
63+
}
64+
}
7765

78-
// TODO(connor)[ListView]: Ideally, we would only rebuild after all `take`s and `filter`
79-
// compute functions have run, at the "top" of the operator tree. However, we cannot do this
80-
// right now, so we will just rebuild every time (similar to `ListArray`).
66+
/// Shared metadata-only take: take `offsets`, `sizes` and `validity` at `indices` while reusing
67+
/// the original `elements` buffer as-is.
68+
fn apply_take(array: ArrayView<'_, ListView>, indices: &ArrayRef) -> VortexResult<ListViewArray> {
69+
let elements = array.elements();
70+
let offsets = array.offsets();
71+
let sizes = array.sizes();
8172

82-
Ok(Some(
83-
new_array
84-
.rebuild(ListViewRebuildMode::MakeZeroCopyToList)?
85-
.into_array(),
86-
))
87-
}
73+
// Combine the array's validity with the indices' validity.
74+
let new_validity = array.validity()?.take(indices)?;
75+
76+
// Take can reorder offsets, create gaps, and may introduce overlaps if `indices` contain
77+
// duplicates.
78+
let nullable_new_offsets = offsets.take(indices.clone())?;
79+
let nullable_new_sizes = sizes.take(indices.clone())?;
80+
81+
// `take` returns nullable arrays; cast back to non-nullable (filling with zeros to represent
82+
// the null lists — the validity mask tracks nullness separately).
83+
let new_offsets = match_each_integer_ptype!(nullable_new_offsets.dtype().as_ptype(), |O| {
84+
nullable_new_offsets.fill_null(Scalar::primitive(O::zero(), Nullability::NonNullable))?
85+
});
86+
let new_sizes = match_each_integer_ptype!(nullable_new_sizes.dtype().as_ptype(), |S| {
87+
nullable_new_sizes.fill_null(Scalar::primitive(S::zero(), Nullability::NonNullable))?
88+
});
89+
90+
// SAFETY: Take operation maintains all `ListViewArray` invariants:
91+
// - `new_offsets` and `new_sizes` are derived from existing valid child arrays.
92+
// - `new_offsets` and `new_sizes` are non-nullable.
93+
// - `new_offsets` and `new_sizes` have the same length (both taken with the same `indices`).
94+
// - Validity correctly reflects the combination of array and indices validity.
95+
Ok(unsafe {
96+
ListViewArray::new_unchecked(elements.clone(), new_offsets, new_sizes, new_validity)
97+
})
8898
}

0 commit comments

Comments
 (0)