From 871195a9e1e4c2bcf95b20b8472cc72fb38e1c22 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Jan 2026 10:45:39 -0500 Subject: [PATCH 1/3] Avoid a clone when creating StringArray/BinaryArray from ArrayData --- arrow-array/src/array/byte_array.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs b/arrow-array/src/array/byte_array.rs index bd85bffcfe44..71c221678826 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -542,30 +542,34 @@ impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> { impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> { fn from(data: ArrayData) -> Self { + let (data_type, len, nulls, offset, mut buffers, _child_data) = data.into_parts(); assert_eq!( - data.data_type(), - &Self::DATA_TYPE, + data_type, + Self::DATA_TYPE, "{}{}Array expects DataType::{}", T::Offset::PREFIX, T::PREFIX, Self::DATA_TYPE ); assert_eq!( - data.buffers().len(), + buffers.len(), 2, "{}{}Array data should contain 2 buffers only (offsets and values)", T::Offset::PREFIX, T::PREFIX, ); + // buffers are offset then value, so pop in reverse + let value_data = buffers.pop().expect("checked above"); + let offset_buffer = buffers.pop().expect("checked above"); + // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = unsafe { get_offsets(&data) }; - let value_data = data.buffers()[1].clone(); + let value_offsets = unsafe { get_offsets(offset_buffer, offset, len) }; Self { value_offsets, value_data, data_type: T::DATA_TYPE, - nulls: data.nulls().cloned(), + nulls, } } } From 31dfc8add2723cf6c93858daee10d2c8143a0fa6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Jan 2026 10:52:36 -0500 Subject: [PATCH 2/3] fixup --- arrow-array/src/array/byte_array.rs | 6 +++--- arrow-array/src/array/mod.rs | 23 ++++++++++++++++++++++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/byte_array.rs 
b/arrow-array/src/array/byte_array.rs index 71c221678826..8e8ad91ceaeb 100644 --- a/arrow-array/src/array/byte_array.rs +++ b/arrow-array/src/array/byte_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{get_offsets, print_long_array}; +use crate::array::{get_offsets_from_buffer, print_long_array}; use crate::builder::GenericByteBuilder; use crate::iterator::ArrayIter; use crate::types::ByteArrayType; @@ -564,11 +564,11 @@ impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> { // SAFETY: // ArrayData is valid, and verified type above - let value_offsets = unsafe { get_offsets(offset_buffer, offset, len) }; + let value_offsets = unsafe { get_offsets_from_buffer(offset_buffer, offset, len) }; Self { value_offsets, value_data, - data_type: T::DATA_TYPE, + data_type, nulls, } } diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index aae382ace7b4..4ce03295a069 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -20,7 +20,7 @@ mod binary_array; use crate::types::*; -use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer, ScalarBuffer}; +use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow_data::ArrayData; use arrow_schema::{DataType, IntervalUnit, TimeUnit}; use std::any::Any; @@ -939,6 +939,27 @@ unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> { } } +/// Helper function that creates an [`OffsetBuffer`] from a buffer and array offset/length +/// +/// # Safety +/// +/// - buffer must contain valid arrow offsets ( [`OffsetBuffer`] ) for the +/// given length and offset. 
+unsafe fn get_offsets_from_buffer<O: ArrowNativeType>( + buffer: Buffer, + offset: usize, + len: usize, +) -> OffsetBuffer<O> { + if len == 0 && buffer.is_empty() { + return OffsetBuffer::new_empty(); + } + + let scalar_buffer = ScalarBuffer::new(buffer, offset, len + 1); + // Safety: + // Arguments were valid + unsafe { OffsetBuffer::new_unchecked(scalar_buffer) } +} + /// Helper function for printing potentially long arrays. fn print_long_array<A, F>(array: &A, f: &mut std::fmt::Formatter, print_item: F) -> std::fmt::Result where From 2cf6d2b13ad951ed92822ca1e52124a7a02cda4f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 13 Jan 2026 10:55:30 -0500 Subject: [PATCH 3/3] clippy --- arrow-array/src/array/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 4ce03295a069..6fcb80c533fe 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -944,7 +944,7 @@ unsafe fn get_offsets<O: ArrowNativeType>(data: &ArrayData) -> OffsetBuffer<O> { /// # Safety /// /// - buffer must contain valid arrow offsets ( [`OffsetBuffer`] ) for the -/// given length and offset. +/// given length and offset. unsafe fn get_offsets_from_buffer( buffer: Buffer, offset: usize,