From 88621badb893222de2b24e6d43755abd1f1ba2ff Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sun, 28 Dec 2025 20:22:22 +0200 Subject: [PATCH 1/3] perf: improve performance of encoding `GenericByteArray` by 15%-20% --- arrow-row/src/lib.rs | 14 ++++++-------- arrow-row/src/variable.rs | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index aa6543485fe3..3ffa71e98c30 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1644,24 +1644,22 @@ fn encode_column( } } DataType::Binary => { - variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) + variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::(column), opts) } DataType::BinaryView => { variable::encode(data, offsets, column.as_binary_view().iter(), opts) } DataType::LargeBinary => { - variable::encode(data, offsets, as_generic_binary_array::(column).iter(), opts) + variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::(column), opts) } - DataType::Utf8 => variable::encode( + DataType::Utf8 => variable::encode_generic_byte_array( data, offsets, - column.as_string::().iter().map(|x| x.map(|x| x.as_bytes())), + column.as_string::(), opts, ), - DataType::LargeUtf8 => variable::encode( + DataType::LargeUtf8 => variable::encode_generic_byte_array( data, offsets, - column.as_string::() - .iter() - .map(|x| x.map(|x| x.as_bytes())), + column.as_string::(), opts, ), DataType::Utf8View => variable::encode( diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index ac2c4cb97c20..d54e5f9ffa87 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -17,9 +17,10 @@ use crate::null_sentinel; use arrow_array::builder::BufferBuilder; +use arrow_array::types::ByteArrayType; use arrow_array::*; -use arrow_buffer::MutableBuffer; use arrow_buffer::bit_util::ceil; +use arrow_buffer::{ArrowNativeType, MutableBuffer}; use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN}; use arrow_schema::{DataType, SortOptions}; use builder::make_view; @@ -84,6 +85,40 @@ pub fn encode<'a, I: Iterator>>( } } +/// Calls [`encode`] with optimized iterator for generic byte arrays +pub(crate) fn encode_generic_byte_array( + data: &mut [u8], + offsets: &mut [usize], + input_array: &GenericByteArray, + opts: SortOptions, +) { + let input_offsets = input_array.value_offsets(); + let bytes = input_array.values().as_slice(); + + if let Some(null_buffer) = input_array.nulls().filter(|x| x.null_count() > 0) { + let input_iter = + input_offsets + .windows(2) + .zip(null_buffer.iter()) + .map(|(start_end, is_null)| { + if is_null { + None + } else { + Some(&bytes[start_end[0].as_usize()..start_end[1].as_usize()]) + } + }); + + encode(data, offsets, input_iter, opts); + } else { + // Skip null checks + let input_iter = input_offsets + .windows(2) + .map(|start_end| Some(&bytes[start_end[0].as_usize()..start_end[1].as_usize()])); + + encode(data, offsets, input_iter, opts); + } +} + pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize { out[0] = null_sentinel(opts); 1 From 06e180b0b5a00c0f587a6204c352c46e28a98a30 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sun, 28 Dec 2025 20:31:00 +0200 Subject: [PATCH 2/3] fix nulls --- arrow-row/src/variable.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index d54e5f9ffa87..a567f28d10df 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -100,11 +100,11 @@ pub(crate) fn encode_generic_byte_array( input_offsets .windows(2) .zip(null_buffer.iter()) - .map(|(start_end, is_null)| { - if is_null { - None - } else { + .map(|(start_end, is_valid)| { + if is_valid { Some(&bytes[start_end[0].as_usize()..start_end[1].as_usize()]) + } else { + None } }); From 7e943df653b8093067f42a7dedeaec7498bde3ce Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Dec 2025 16:13:11 +0200 Subject: [PATCH 3/3] use unsafe and inline `encode_one` so the compiler will have a better chance for skipping the encode null for non nullable --- arrow-row/src/variable.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index a567f28d10df..73e19b197f92 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -102,7 +102,11 @@ pub(crate) fn encode_generic_byte_array( .zip(null_buffer.iter()) .map(|(start_end, is_valid)| { if is_valid { - Some(&bytes[start_end[0].as_usize()..start_end[1].as_usize()]) + let item_range = start_end[0].as_usize()..start_end[1].as_usize(); + // SAFETY: the offsets of the input are valid by construction + // so it is ok to use unsafe here + let item = unsafe { bytes.get_unchecked(item_range) }; + Some(item) } else { None } @@ -111,9 +115,13 @@ pub(crate) fn encode_generic_byte_array( encode(data, offsets, input_iter, opts); } else { // Skip null checks - let input_iter = input_offsets - .windows(2) - .map(|start_end| Some(&bytes[start_end[0].as_usize()..start_end[1].as_usize()])); + let input_iter = input_offsets.windows(2).map(|start_end| { + let item_range = start_end[0].as_usize()..start_end[1].as_usize(); + // SAFETY: the offsets of the input are valid by construction + // so it is ok to use unsafe here + let item = unsafe { bytes.get_unchecked(item_range) }; + Some(item) + }); encode(data, offsets, input_iter, opts); } @@ -132,6 +140,7 @@ pub fn encode_empty(out: &mut [u8], opts: SortOptions) -> usize { 1 } +#[inline] pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usize { match val { None => encode_null(out, opts),