From 314bccbfdca4d5041eae2bd14378f1d567b6164f Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 31 Dec 2025 21:34:17 +0200 Subject: [PATCH 1/2] perf: improve calculating length performance for nested arrays in row conversion --- arrow-row/src/lib.rs | 77 ++++++++++++++++++++++++++++++++++++++++--- arrow-row/src/list.rs | 34 +++++++++---------- arrow-row/src/run.rs | 4 +-- 3 files changed, 92 insertions(+), 23 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 3ffa71e98c30..23b633b4ca6b 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -161,6 +161,8 @@ #![warn(missing_docs)] use std::cmp::Ordering; use std::hash::{Hash, Hasher}; +use std::iter::Map; +use std::slice::Windows; use std::sync::Arc; use arrow_array::cast::*; @@ -1077,6 +1079,9 @@ pub struct Rows { config: RowConfig, } +/// The iterator type for [`Rows::lengths`] +pub type RowLengthIter<'a> = Map, fn(&'a [usize]) -> usize>; + impl Rows { /// Append a [`Row`] to this [`Rows`] pub fn push(&mut self, row: Row<'_>) { @@ -1109,6 +1114,29 @@ impl Rows { } } + /// Returns the length of the row at index `row` in bytes + pub fn row_len(&self, row: usize) -> usize { + assert!(row + 1 < self.offsets.len()); + + unsafe { self.row_len_unchecked(row) } + } + + /// Returns the length of the row at `index` in bytes without bounds checking + /// + /// # Safety + /// Caller must ensure that `index` is less than the number of offsets (#rows + 1) + pub unsafe fn row_len_unchecked(&self, index: usize) -> usize { + let end = unsafe { self.offsets.get_unchecked(index + 1) }; + let start = unsafe { self.offsets.get_unchecked(index) }; + + end - start + } + + /// Get an iterator over the lengths of each row in this [`Rows`] + pub fn lengths(&self) -> RowLengthIter<'_> { + self.offsets.windows(2).map(|w| w[1] - w[0]) + } + /// Sets the length of this [`Rows`] to 0 pub fn clear(&mut self) { self.offsets.truncate(1); @@ -1540,7 +1568,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { array => { tracker.push_variable( array.keys().iter().map(|v| match v { - Some(k) => values.row(k.as_usize()).data.len(), + Some(k) => values.row_len(k.as_usize()), None => null.data.len(), }) ) @@ -1551,7 +1579,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { Encoder::Struct(rows, null) => { let array = as_struct_array(array); tracker.push_variable((0..array.len()).map(|idx| match array.is_valid(idx) { - true => 1 + rows.row(idx).as_ref().len(), + true => 1 + rows.row_len(idx), false => 1 + null.data.len(), })); } @@ -1603,10 +1631,10 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { let lengths = (0..union_array.len()).map(|i| { let type_id = type_ids[i]; let child_row_i = offsets.as_ref().map(|o| o[i] as usize).unwrap_or(i); - let child_row = child_rows[type_id as usize].row(child_row_i); + let child_row_len = child_rows[type_id as usize].row_len(child_row_i); // length: 1 byte type_id + child row bytes - 1 + child_row.as_ref().len() + 1 + child_row_len }); tracker.push_variable(lengths); @@ -3611,6 +3639,38 @@ mod tests { } } + // Validate rows length iterator + { + let mut rows_iter = rows.iter(); + let mut rows_lengths_iter = rows.lengths(); + for (index, row) in rows_iter.by_ref().enumerate() { + let len = rows_lengths_iter + .next() + .expect("Reached end of length iterator while still have rows"); + assert_eq!( + row.data.len(), + len, + "Row length mismatch: {} vs {}", + row.data.len(), + len + ); + assert_eq!( + len, + rows.row_len(index), + "Row length mismatch at index {}: {} vs {}", + index, + len, + rows.row_len(index) + ); + } + + assert_eq!( + rows_lengths_iter.next(), + None, + "Length iterator did not reach end" + ); + } + // Convert rows produced from convert_columns(). // Note: validate_utf8 is set to false since Row is initialized through empty_rows() let back = converter.convert_rows(&rows).unwrap(); @@ -4091,4 +4151,13 @@ mod tests { "{empty_rows_size_with_preallocate_data} should be larger than {empty_rows_size_without_preallocate}" ); } + + #[test] + fn empty_rows_should_return_empty_lengths_iterator() { + let rows = RowConverter::new(vec![SortField::new(DataType::UInt8)]) + .unwrap() + .empty_rows(0, 0); + let mut lengths_iter = rows.lengths(); + assert_eq!(lengths_iter.next(), None); + } } diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index e04aa70c528f..6e552b0a93b9 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -27,32 +27,32 @@ pub fn compute_lengths( rows: &Rows, array: &GenericListArray, ) { - let shift = array.value_offsets()[0].as_usize(); - let offsets = array.value_offsets().windows(2); + let mut rows_length_iter = rows.lengths(); + lengths .iter_mut() .zip(offsets) .enumerate() .for_each(|(idx, (length, offsets))| { - let start = offsets[0].as_usize() - shift; - let end = offsets[1].as_usize() - shift; - let range = array.is_valid(idx).then_some(start..end); - *length += encoded_len(rows, range); + let len = offsets[1].as_usize() - offsets[0].as_usize(); + if array.is_valid(idx) { + *length += 1 + rows_length_iter + .by_ref() + .take(len) + .map(Some) + .map(super::variable::padded_length) + .sum::() + } else { + // Advance rows iterator by len + if len > 0 { + rows_length_iter.nth(len - 1); + } + *length += 1; + } }); } -fn encoded_len(rows: &Rows, range: Option>) -> usize { - match range { - None => 1, - Some(range) => { - 1 + range - .map(|i| super::variable::padded_length(Some(rows.row(i).as_ref().len()))) - .sum::() - } - } -} - /// Encodes the provided `GenericListArray` to `out` with the provided `SortOptions` /// /// `rows` should contain the encoded child elements diff --git a/arrow-row/src/run.rs b/arrow-row/src/run.rs index 24eaaa18e018..e12fa87dce4b 100644 --- a/arrow-row/src/run.rs +++ b/arrow-row/src/run.rs @@ -33,8 +33,8 @@ pub fn compute_lengths( // Iterate over each run and apply the same length to all logical positions in the run for (physical_idx, &run_end) in run_ends.iter().enumerate() { let logical_end = run_end.as_usize(); - let row = rows.row(physical_idx); - let encoded_len = variable::encoded_len(Some(row.data)); + let row_len = rows.row_len(physical_idx); + let encoded_len = variable::padded_length(Some(row_len)); // Add the same length for all logical positions in this run for length in &mut lengths[logical_start..logical_end] { From b9a6c8cc0c6dbd07341c4624f54f19bda74f98e4 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:28:55 +0200 Subject: [PATCH 2/2] update based on cr comments --- arrow-row/src/lib.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index d9f9096b0511..f25b51f3655d 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1161,22 +1161,12 @@ impl Rows { } } - /// Returns the length of the row at index `row` in bytes + /// Returns the number of bytes the row at index `row` is occupying, + /// that is, what is the length of the returned [`Row::data`] will be. pub fn row_len(&self, row: usize) -> usize { assert!(row + 1 < self.offsets.len()); - unsafe { self.row_len_unchecked(row) } - } - - /// Returns the length of the row at `index` in bytes without bounds checking - /// - /// # Safety - /// Caller must ensure that `index` is less than the number of offsets (#rows + 1) - pub unsafe fn row_len_unchecked(&self, index: usize) -> usize { - let end = unsafe { self.offsets.get_unchecked(index + 1) }; - let start = unsafe { self.offsets.get_unchecked(index) }; - - end - start + self.offsets[row + 1] - self.offsets[row] } /// Get an iterator over the lengths of each row in this [`Rows`]