From 57094f802eef634e1c8585ab41855f15124fbd90 Mon Sep 17 00:00:00 2001 From: Florent Monjalet Date: Wed, 31 Dec 2025 22:19:16 +0100 Subject: [PATCH 1/5] wip typed iter --- arrow-array/src/array/mod.rs | 1 + arrow-array/src/array/typed_list_iter.rs | 58 ++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 arrow-array/src/array/typed_list_iter.rs diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index bb114be95045..96e83f03cb86 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -73,6 +73,7 @@ mod byte_view_array; pub use byte_view_array::*; mod list_view_array; +mod typed_list_iter; pub use list_view_array::*; diff --git a/arrow-array/src/array/typed_list_iter.rs b/arrow-array/src/array/typed_list_iter.rs new file mode 100644 index 000000000000..a4b3c7f40b09 --- /dev/null +++ b/arrow-array/src/array/typed_list_iter.rs @@ -0,0 +1,58 @@ +use arrow_buffer::{NullBuffer, OffsetBuffer}; +use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, PrimitiveArray}; +use crate::types::{ArrowDictionaryKeyType, ByteArrayType}; + +/// Arrays that can be sliced in a zero copy, zero allocation way. +pub trait SliceableArray { + fn slice(&self, offset: usize, length: usize) -> Self; +} + +impl SliceableArray for PrimitiveArray { + fn slice(&self, offset: usize, length: usize) -> Self { + PrimitiveArray::slice(self, offset, length) + } +} +impl SliceableArray for GenericByteArray { + fn slice(&self, offset: usize, length: usize) -> Self { + GenericByteArray::slice(self, offset, length) + } +} + +impl SliceableArray for DictionaryArray { + fn slice(&self, offset: usize, length: usize) -> Self { + DictionaryArray::slice(self, offset, length) + } +} + +/// A typed iterator on a GenericListArray. +/// +/// Downcasting at iterator creation time allows to avoid allocations during the iteration, since +/// we can now directly return the target type, instead of having to create +/// Arc that require allocations. This version should be both more ergonomic and more +/// efficient than the standard ArrayIter for GenericListArrays for supported types. +pub struct GenericListTypedIter { + nulls: Option, + values: ValueArray, + value_offsets: OffsetBuffer, +} + +impl GenericListTypedIter { + pub fn new(list: GenericListArray) -> Option { + let nulls = list.nulls().cloned(); + let values = list.values().as_any().downcast_ref::()?.clone(); + let value_offsets = list.offsets().clone(); + Some(Self { + nulls, + values, + value_offsets + }) + } +} + +impl Iterator for GenericListTypedIter { + type Item = ValueArray; + + fn next(&mut self) -> Option { + todo!() + } +} \ No newline at end of file From a1b26013522251626a2d8470a423bec05e7194d4 Mon Sep 17 00:00:00 2001 From: Florent Monjalet Date: Wed, 31 Dec 2025 22:32:02 +0100 Subject: [PATCH 2/5] wip finished first impl --- arrow-array/src/array/list_array.rs | 30 +++ arrow-array/src/array/typed_list_iter.rs | 223 ++++++++++++++++++++++- 2 files changed, 249 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 32add1abf557..e296035dc201 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -374,6 +374,36 @@ impl GenericListArray { GenericListArrayIter::<'a, OffsetSize>::new(self) } + /// Constructs a new typed iterator that avoids `Arc` allocations + /// by returning concrete array types. + /// + /// This method downcasts the child values array to the specified type `ValueArray`. + /// Returns `None` if the downcast fails (i.e., the child array is not of the expected type). + /// + /// # Example + /// ``` + /// # use arrow_array::{ListArray, Int64Array, types::Int64Type}; + /// # use arrow_array::array::typed_list_iter::GenericListTypedIter; + /// let list_array = ListArray::from_iter_primitive::(vec![ + /// Some(vec![Some(1), Some(2)]), + /// None, + /// Some(vec![Some(3)]), + /// ]); + /// + /// let typed_iter: Option> = list_array.typed_iter(); + /// let mut iter = typed_iter.unwrap(); + /// + /// assert!(iter.next().unwrap().is_some()); // First element + /// assert!(iter.next().unwrap().is_none()); // Null element + /// assert!(iter.next().unwrap().is_some()); // Third element + /// ``` + pub fn typed_iter(&self) -> Option> + where + ValueArray: crate::array::typed_list_iter::SliceableArray + Clone + 'static, + { + crate::array::typed_list_iter::GenericListTypedIter::new(self.clone()) + } + #[inline] fn get_type(data_type: &DataType) -> Option<&DataType> { match (OffsetSize::IS_LARGE, data_type) { diff --git a/arrow-array/src/array/typed_list_iter.rs b/arrow-array/src/array/typed_list_iter.rs index a4b3c7f40b09..e1a779516833 100644 --- a/arrow-array/src/array/typed_list_iter.rs +++ b/arrow-array/src/array/typed_list_iter.rs @@ -1,5 +1,5 @@ use arrow_buffer::{NullBuffer, OffsetBuffer}; -use crate::{Array, ArrayRef, ArrowPrimitiveType, DictionaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, PrimitiveArray}; +use crate::{Array, ArrowPrimitiveType, DictionaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, PrimitiveArray}; use crate::types::{ArrowDictionaryKeyType, ByteArrayType}; /// Arrays that can be sliced in a zero copy, zero allocation way. @@ -34,6 +34,7 @@ pub struct GenericListTypedIter, values: ValueArray, value_offsets: OffsetBuffer, + current: usize, } impl GenericListTypedIter { @@ -44,15 +45,229 @@ impl Some(Self { nulls, values, - value_offsets + value_offsets, + current: 0, }) } } impl Iterator for GenericListTypedIter { - type Item = ValueArray; + type Item = Option; fn next(&mut self) -> Option { - todo!() + // Check if we've reached the end + if self.current >= self.value_offsets.len() - 1 { + return None; + } + + // Check if current row is null + let is_null = self.nulls.as_ref().map_or(false, |n| n.is_null(self.current)); + + let result = if is_null { + Some(None) + } else { + // Get start and end offsets for this list element + let start = self.value_offsets[self.current].as_usize(); + let end = self.value_offsets[self.current + 1].as_usize(); + + // Slice the values array - this is zero-copy + Some(Some(self.values.slice(start, end - start))) + }; + + self.current += 1; + result + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.value_offsets.len() - 1 - self.current; + (remaining, Some(remaining)) + } +} + +impl ExactSizeIterator for GenericListTypedIter { + fn len(&self) -> usize { + self.value_offsets.len() - 1 - self.current + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{Int32Array, Int64Array, ListArray, StringArray, types::{Int32Type, Int64Type}}; + + #[test] + fn test_primitive_array_no_nulls() { + let list_array = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2), Some(3)]), + Some(vec![Some(4)]), + Some(vec![]), + ]); + + let typed_iter: Option> = list_array.typed_iter(); + let mut iter = typed_iter.unwrap(); + + // First element + let arr = iter.next().unwrap().unwrap(); + assert_eq!(arr.len(), 3); + assert_eq!(arr.value(0), 1); + assert_eq!(arr.value(1), 2); + assert_eq!(arr.value(2), 3); + + // Second element + let arr = iter.next().unwrap().unwrap(); + assert_eq!(arr.len(), 1); + assert_eq!(arr.value(0), 4); + + // Third element (empty list) + let arr = iter.next().unwrap().unwrap(); + assert_eq!(arr.len(), 0); + + // No more elements + assert!(iter.next().is_none()); + } + + #[test] + fn test_primitive_array_with_nulls() { + let list_array = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3)]), + None, + ]); + + let typed_iter: Option> = list_array.typed_iter(); + let mut iter = typed_iter.unwrap(); + + // First element + assert!(iter.next().unwrap().is_some()); + + // Null element + assert!(iter.next().unwrap().is_none()); + + // Third element + assert!(iter.next().unwrap().is_some()); + + // Another null element + assert!(iter.next().unwrap().is_none()); + + // No more elements + assert!(iter.next().is_none()); + } + + #[test] + fn test_string_array() { + let list_array = ListArray::new( + arrow_schema::Field::new("item", arrow_schema::DataType::Utf8, true).into(), + arrow_buffer::OffsetBuffer::from_lengths([2, 1, 3]), + std::sync::Arc::new(StringArray::from(vec![ + Some("a"), Some("b"), // First list + Some("c"), // Second list + Some("d"), Some("e"), Some("f"), // Third list + ])), + None, + ); + + let typed_iter: Option> = list_array.typed_iter(); + let mut iter = typed_iter.unwrap(); + + // First element + let arr = iter.next().unwrap().unwrap(); + assert_eq!(arr.len(), 2); + assert_eq!(arr.value(0), "a"); + assert_eq!(arr.value(1), "b"); + + // Second element + let arr = iter.next().unwrap().unwrap(); + assert_eq!(arr.len(), 1); + assert_eq!(arr.value(0), "c"); + + // Third element + let arr = iter.next().unwrap().unwrap(); + assert_eq!(arr.len(), 3); + assert_eq!(arr.value(0), "d"); + assert_eq!(arr.value(1), "e"); + assert_eq!(arr.value(2), "f"); + + // No more elements + assert!(iter.next().is_none()); + } + + #[test] + fn test_wrong_type_returns_none() { + let list_array = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + ]); + + // Try to create iterator with wrong type - should return None + let typed_iter: Option> = list_array.typed_iter(); + assert!(typed_iter.is_none()); + } + + #[test] + fn test_iterator_size_hint() { + let list_array = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1)]), + Some(vec![Some(2)]), + Some(vec![Some(3)]), + ]); + + let typed_iter: Option> = list_array.typed_iter(); + let iter = typed_iter.unwrap(); + + assert_eq!(iter.size_hint(), (3, Some(3))); + assert_eq!(iter.len(), 3); + } + + #[test] + fn test_iterator_with_enumerate() { + let list_array = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(10)]), + None, + Some(vec![Some(20), Some(30)]), + ]); + + let typed_iter: Option> = list_array.typed_iter(); + let iter = typed_iter.unwrap(); + + for (idx, arr) in iter.enumerate() { + match idx { + 0 => { + let a = arr.unwrap(); + assert_eq!(a.len(), 1); + assert_eq!(a.value(0), 10); + } + 1 => assert!(arr.is_none()), + 2 => { + let a = arr.unwrap(); + assert_eq!(a.len(), 2); + assert_eq!(a.value(0), 20); + assert_eq!(a.value(1), 30); + } + _ => panic!("Unexpected index"), + } + } + } + + #[test] + fn test_iterator_with_zip() { + let list1 = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + Some(vec![Some(3)]), + ]); + let list2 = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(10)]), + Some(vec![Some(20), Some(30)]), + ]); + + let iter1: GenericListTypedIter = list1.typed_iter().unwrap(); + let iter2: GenericListTypedIter = list2.typed_iter().unwrap(); + + for (arr1, arr2) in iter1.zip(iter2) { + let a1 = arr1.unwrap(); + let a2 = arr2.unwrap(); + // Just verify they're not empty + assert!(a1.len() > 0); + assert!(a2.len() > 0); + } } } \ No newline at end of file From 25d9badda60c221f4297fd1e64a2a902066079db Mon Sep 17 00:00:00 2001 From: Florent Monjalet Date: Wed, 31 Dec 2025 23:08:53 +0100 Subject: [PATCH 3/5] more types --- arrow-array/Cargo.toml | 4 ++++ arrow-array/src/array/typed_list_iter.rs | 24 ++++++++++++++++++++++-- arrow/benches/array_iter.rs | 14 ++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml index 8ab0bb290e96..eac3e308d30a 100644 --- a/arrow-array/Cargo.toml +++ b/arrow-array/Cargo.toml @@ -83,4 +83,8 @@ harness = false [[bench]] name = "record_batch" +harness = false + +[[bench]] +name = "list_iterator" harness = false \ No newline at end of file diff --git a/arrow-array/src/array/typed_list_iter.rs b/arrow-array/src/array/typed_list_iter.rs index e1a779516833..b3e993076d5c 100644 --- a/arrow-array/src/array/typed_list_iter.rs +++ b/arrow-array/src/array/typed_list_iter.rs @@ -1,6 +1,6 @@ use arrow_buffer::{NullBuffer, OffsetBuffer}; -use crate::{Array, ArrowPrimitiveType, DictionaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait, PrimitiveArray}; -use crate::types::{ArrowDictionaryKeyType, ByteArrayType}; +use crate::{Array, ArrowPrimitiveType, DictionaryArray, GenericByteArray, GenericByteViewArray, GenericListArray, GenericListViewArray, OffsetSizeTrait, PrimitiveArray}; +use crate::types::{ArrowDictionaryKeyType, ByteArrayType, ByteViewType}; /// Arrays that can be sliced in a zero copy, zero allocation way. pub trait SliceableArray { @@ -18,12 +18,32 @@ impl SliceableArray for GenericByteArray { } } +impl SliceableArray for GenericByteViewArray { + fn slice(&self, offset: usize, length: usize) -> Self { + GenericByteViewArray::slice(self, offset, length) + } +} + impl SliceableArray for DictionaryArray { fn slice(&self, offset: usize, length: usize) -> Self { DictionaryArray::slice(self, offset, length) } } + +impl SliceableArray for GenericListArray { + fn slice(&self, offset: usize, length: usize) -> Self { + GenericListArray::slice(self, offset, length) + } +} + + +impl SliceableArray for GenericListViewArray { + fn slice(&self, offset: usize, length: usize) -> Self { + GenericListViewArray::slice(self, offset, length) + } +} + /// A typed iterator on a GenericListArray. /// /// Downcasting at iterator creation time allows to avoid allocations during the iteration, since diff --git a/arrow/benches/array_iter.rs b/arrow/benches/array_iter.rs index 14738196bf40..b8294519fecc 100644 --- a/arrow/benches/array_iter.rs +++ b/arrow/benches/array_iter.rs @@ -299,6 +299,20 @@ fn add_benchmark(c: &mut Criterion) { // Must use black_box here as this can be optimized away |_item| hint::black_box(false), ); + + benchmark_array_iter( + c, + "int list array with len 16", + &create_primitive_list_array_with_seed::(BATCH_SIZE, 0.0, 0.0, 16, 0), + &create_primitive_list_array_with_seed::(BATCH_SIZE, 0.5, 0.0, 16, 0), + // fold init + 0_usize, + // fold function + |acc, item| acc.wrapping_add(item.map(|item| item.len()).unwrap_or_default()), + // predicate that will always evaluate to false while allowing us to avoid using hint::black_box and let the compiler optimize more + |item| item.is_some_and(|item| item.len() > 100), + ); + } criterion_group!(benches, add_benchmark); From 0af0e2917598aea880f038a4cfe2854b73db3ab0 Mon Sep 17 00:00:00 2001 From: Florent Monjalet Date: Wed, 31 Dec 2025 23:13:56 +0100 Subject: [PATCH 4/5] more types 2 --- arrow-array/src/array/typed_list_iter.rs | 52 +++++++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/typed_list_iter.rs b/arrow-array/src/array/typed_list_iter.rs index b3e993076d5c..a9c7fdbe3c08 100644 --- a/arrow-array/src/array/typed_list_iter.rs +++ b/arrow-array/src/array/typed_list_iter.rs @@ -1,6 +1,6 @@ use arrow_buffer::{NullBuffer, OffsetBuffer}; -use crate::{Array, ArrowPrimitiveType, DictionaryArray, GenericByteArray, GenericByteViewArray, GenericListArray, GenericListViewArray, OffsetSizeTrait, PrimitiveArray}; -use crate::types::{ArrowDictionaryKeyType, ByteArrayType, ByteViewType}; +use crate::{Array, ArrowPrimitiveType, BooleanArray, DictionaryArray, FixedSizeBinaryArray, FixedSizeListArray, GenericByteArray, GenericByteViewArray, GenericListArray, GenericListViewArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, RunArray, StructArray, UnionArray}; +use crate::types::{ArrowDictionaryKeyType, ByteArrayType, ByteViewType, RunEndIndexType}; /// Arrays that can be sliced in a zero copy, zero allocation way. pub trait SliceableArray { @@ -44,6 +44,54 @@ impl SliceableArray for GenericListViewArray { } } +impl SliceableArray for BooleanArray { + fn slice(&self, offset: usize, length: usize) -> Self { + BooleanArray::slice(self, offset, length) + } +} + +impl SliceableArray for FixedSizeBinaryArray { + fn slice(&self, offset: usize, length: usize) -> Self { + FixedSizeBinaryArray::slice(self, offset, length) + } +} + +impl SliceableArray for FixedSizeListArray { + fn slice(&self, offset: usize, length: usize) -> Self { + FixedSizeListArray::slice(self, offset, length) + } +} + +impl SliceableArray for NullArray { + fn slice(&self, offset: usize, length: usize) -> Self { + NullArray::slice(self, offset, length) + } +} + +impl SliceableArray for MapArray { + fn slice(&self, offset: usize, length: usize) -> Self { + MapArray::slice(self, offset, length) + } +} + +impl SliceableArray for RunArray { + fn slice(&self, offset: usize, length: usize) -> Self { + RunArray::slice(self, offset, length) + } +} + +impl SliceableArray for StructArray { + fn slice(&self, offset: usize, length: usize) -> Self { + StructArray::slice(self, offset, length) + } +} + +impl SliceableArray for UnionArray { + fn slice(&self, offset: usize, length: usize) -> Self { + UnionArray::slice(self, offset, length) + } +} + /// A typed iterator on a GenericListArray. /// /// Downcasting at iterator creation time allows to avoid allocations during the iteration, since From f46d089c3026285c08a94de69545eab2719ec896 Mon Sep 17 00:00:00 2001 From: Florent Monjalet Date: Wed, 31 Dec 2025 23:19:02 +0100 Subject: [PATCH 5/5] less code --- arrow-array/src/array/typed_list_iter.rs | 114 +++++++---------------- 1 file changed, 32 insertions(+), 82 deletions(-) diff --git a/arrow-array/src/array/typed_list_iter.rs b/arrow-array/src/array/typed_list_iter.rs index a9c7fdbe3c08..de3f5d5c0412 100644 --- a/arrow-array/src/array/typed_list_iter.rs +++ b/arrow-array/src/array/typed_list_iter.rs @@ -7,90 +7,40 @@ pub trait SliceableArray { fn slice(&self, offset: usize, length: usize) -> Self; } -impl SliceableArray for PrimitiveArray { - fn slice(&self, offset: usize, length: usize) -> Self { - PrimitiveArray::slice(self, offset, length) - } -} -impl SliceableArray for GenericByteArray { - fn slice(&self, offset: usize, length: usize) -> Self { - GenericByteArray::slice(self, offset, length) - } -} - -impl SliceableArray for GenericByteViewArray { - fn slice(&self, offset: usize, length: usize) -> Self { - GenericByteViewArray::slice(self, offset, length) - } -} - -impl SliceableArray for DictionaryArray { - fn slice(&self, offset: usize, length: usize) -> Self { - DictionaryArray::slice(self, offset, length) - } -} - - -impl SliceableArray for GenericListArray { - fn slice(&self, offset: usize, length: usize) -> Self { - GenericListArray::slice(self, offset, length) - } -} - - -impl SliceableArray for GenericListViewArray { - fn slice(&self, offset: usize, length: usize) -> Self { - GenericListViewArray::slice(self, offset, length) - } -} - -impl SliceableArray for BooleanArray { - fn slice(&self, offset: usize, length: usize) -> Self { - BooleanArray::slice(self, offset, length) - } -} - -impl SliceableArray for FixedSizeBinaryArray { - fn slice(&self, offset: usize, length: usize) -> Self { - FixedSizeBinaryArray::slice(self, offset, length) - } -} - -impl SliceableArray for FixedSizeListArray { - fn slice(&self, offset: usize, length: usize) -> Self { - FixedSizeListArray::slice(self, offset, length) - } -} - -impl SliceableArray for NullArray { - fn slice(&self, offset: usize, length: usize) -> Self { - NullArray::slice(self, offset, length) - } -} - -impl SliceableArray for MapArray { - fn slice(&self, offset: usize, length: usize) -> Self { - MapArray::slice(self, offset, length) - } -} - -impl SliceableArray for RunArray { - fn slice(&self, offset: usize, length: usize) -> Self { - RunArray::slice(self, offset, length) - } -} - -impl SliceableArray for StructArray { - fn slice(&self, offset: usize, length: usize) -> Self { - StructArray::slice(self, offset, length) - } +/// Macro to implement SliceableArray for array types that have a slice method +macro_rules! impl_sliceable { + // Pattern for types without generic parameters + ($array_type:ty) => { + impl SliceableArray for $array_type { + fn slice(&self, offset: usize, length: usize) -> Self { + <$array_type>::slice(self, offset, length) + } + } + }; + // Pattern for types with generic parameters: impl_sliceable!(ArrayType, TraitBound) + ($array_type:ident, $($bounds:tt)+) => { + impl SliceableArray for $array_type { + fn slice(&self, offset: usize, length: usize) -> Self { + $array_type::slice(self, offset, length) + } + } + }; } -impl SliceableArray for UnionArray { - fn slice(&self, offset: usize, length: usize) -> Self { - UnionArray::slice(self, offset, length) - } -} +impl_sliceable!(BooleanArray); +impl_sliceable!(DictionaryArray, ArrowDictionaryKeyType); +impl_sliceable!(FixedSizeBinaryArray); +impl_sliceable!(FixedSizeListArray); +impl_sliceable!(GenericByteArray, ByteArrayType); +impl_sliceable!(GenericByteViewArray, ByteViewType + ?Sized); +impl_sliceable!(GenericListArray, OffsetSizeTrait); +impl_sliceable!(GenericListViewArray, OffsetSizeTrait); +impl_sliceable!(MapArray); +impl_sliceable!(NullArray); +impl_sliceable!(PrimitiveArray, ArrowPrimitiveType); +impl_sliceable!(RunArray, RunEndIndexType); +impl_sliceable!(StructArray); +impl_sliceable!(UnionArray); /// A typed iterator on a GenericListArray. ///