Skip to content

Commit 13d497a

Browse files
rluvatonalamb
andauthored
perf: improve calculating length performance for nested arrays in row conversion (#9079)
# Which issue does this PR close? N/A # Rationale for this change Making the row length calculation faster which result in faster row conversion # What changes are included in this PR? 1. Instead of iterating over the rows and getting the length from the byte slice, we use the offsets directly, this 2. Added 3 new APIs for `Rows` (explained below) # Are these changes tested? Yes # Are there any user-facing changes? Yes, added 3 functions to `Rows`: - `row_len` - get the row length at index - `row_len_unchecked` - get the row length at index without bound checks - `lengths` - get iterator over the lengths of the rows ----- Related to: - #9078 - #9080 --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent f445bfc commit 13d497a

File tree

3 files changed

+82
-23
lines changed

3 files changed

+82
-23
lines changed

arrow-row/src/lib.rs

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@
161161
#![warn(missing_docs)]
162162
use std::cmp::Ordering;
163163
use std::hash::{Hash, Hasher};
164+
use std::iter::Map;
165+
use std::slice::Windows;
164166
use std::sync::Arc;
165167

166168
use arrow_array::cast::*;
@@ -1118,6 +1120,9 @@ pub struct Rows {
11181120
config: RowConfig,
11191121
}
11201122

1123+
/// The iterator type for [`Rows::lengths`]
1124+
pub type RowLengthIter<'a> = Map<Windows<'a, usize>, fn(&'a [usize]) -> usize>;
1125+
11211126
impl Rows {
11221127
/// Append a [`Row`] to this [`Rows`]
11231128
pub fn push(&mut self, row: Row<'_>) {
@@ -1156,6 +1161,19 @@ impl Rows {
11561161
}
11571162
}
11581163

1164+
/// Returns the number of bytes the row at index `row` is occupying,
1165+
/// that is, what is the length of the returned [`Row::data`] will be.
1166+
pub fn row_len(&self, row: usize) -> usize {
1167+
assert!(row + 1 < self.offsets.len());
1168+
1169+
self.offsets[row + 1] - self.offsets[row]
1170+
}
1171+
1172+
/// Get an iterator over the lengths of each row in this [`Rows`]
1173+
pub fn lengths(&self) -> RowLengthIter<'_> {
1174+
self.offsets.windows(2).map(|w| w[1] - w[0])
1175+
}
1176+
11591177
/// Sets the length of this [`Rows`] to 0
11601178
pub fn clear(&mut self) {
11611179
self.offsets.truncate(1);
@@ -1563,7 +1581,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
15631581
array => {
15641582
tracker.push_variable(
15651583
array.keys().iter().map(|v| match v {
1566-
Some(k) => values.row(k.as_usize()).data.len(),
1584+
Some(k) => values.row_len(k.as_usize()),
15671585
None => null.data.len(),
15681586
})
15691587
)
@@ -1574,7 +1592,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
15741592
Encoder::Struct(rows, null) => {
15751593
let array = as_struct_array(array);
15761594
tracker.push_variable((0..array.len()).map(|idx| match array.is_valid(idx) {
1577-
true => 1 + rows.row(idx).as_ref().len(),
1595+
true => 1 + rows.row_len(idx),
15781596
false => 1 + null.data.len(),
15791597
}));
15801598
}
@@ -1626,10 +1644,10 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
16261644
let lengths = (0..union_array.len()).map(|i| {
16271645
let type_id = type_ids[i];
16281646
let child_row_i = offsets.as_ref().map(|o| o[i] as usize).unwrap_or(i);
1629-
let child_row = child_rows[type_id as usize].row(child_row_i);
1647+
let child_row_len = child_rows[type_id as usize].row_len(child_row_i);
16301648

16311649
// length: 1 byte type_id + child row bytes
1632-
1 + child_row.as_ref().len()
1650+
1 + child_row_len
16331651
});
16341652

16351653
tracker.push_variable(lengths);
@@ -3699,6 +3717,38 @@ mod tests {
36993717
}
37003718
}
37013719

3720+
// Validate rows length iterator
3721+
{
3722+
let mut rows_iter = rows.iter();
3723+
let mut rows_lengths_iter = rows.lengths();
3724+
for (index, row) in rows_iter.by_ref().enumerate() {
3725+
let len = rows_lengths_iter
3726+
.next()
3727+
.expect("Reached end of length iterator while still have rows");
3728+
assert_eq!(
3729+
row.data.len(),
3730+
len,
3731+
"Row length mismatch: {} vs {}",
3732+
row.data.len(),
3733+
len
3734+
);
3735+
assert_eq!(
3736+
len,
3737+
rows.row_len(index),
3738+
"Row length mismatch at index {}: {} vs {}",
3739+
index,
3740+
len,
3741+
rows.row_len(index)
3742+
);
3743+
}
3744+
3745+
assert_eq!(
3746+
rows_lengths_iter.next(),
3747+
None,
3748+
"Length iterator did not reach end"
3749+
);
3750+
}
3751+
37023752
// Convert rows produced from convert_columns().
37033753
// Note: validate_utf8 is set to false since Row is initialized through empty_rows()
37043754
let back = converter.convert_rows(&rows).unwrap();
@@ -4351,4 +4401,13 @@ mod tests {
43514401
"Size should increase when reserving more space than previously reserved"
43524402
);
43534403
}
4404+
4405+
#[test]
4406+
fn empty_rows_should_return_empty_lengths_iterator() {
4407+
let rows = RowConverter::new(vec![SortField::new(DataType::UInt8)])
4408+
.unwrap()
4409+
.empty_rows(0, 0);
4410+
let mut lengths_iter = rows.lengths();
4411+
assert_eq!(lengths_iter.next(), None);
4412+
}
43544413
}

arrow-row/src/list.rs

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,32 +27,32 @@ pub fn compute_lengths<O: OffsetSizeTrait>(
2727
rows: &Rows,
2828
array: &GenericListArray<O>,
2929
) {
30-
let shift = array.value_offsets()[0].as_usize();
31-
3230
let offsets = array.value_offsets().windows(2);
31+
let mut rows_length_iter = rows.lengths();
32+
3333
lengths
3434
.iter_mut()
3535
.zip(offsets)
3636
.enumerate()
3737
.for_each(|(idx, (length, offsets))| {
38-
let start = offsets[0].as_usize() - shift;
39-
let end = offsets[1].as_usize() - shift;
40-
let range = array.is_valid(idx).then_some(start..end);
41-
*length += encoded_len(rows, range);
38+
let len = offsets[1].as_usize() - offsets[0].as_usize();
39+
if array.is_valid(idx) {
40+
*length += 1 + rows_length_iter
41+
.by_ref()
42+
.take(len)
43+
.map(Some)
44+
.map(super::variable::padded_length)
45+
.sum::<usize>()
46+
} else {
47+
// Advance rows iterator by len
48+
if len > 0 {
49+
rows_length_iter.nth(len - 1);
50+
}
51+
*length += 1;
52+
}
4253
});
4354
}
4455

45-
fn encoded_len(rows: &Rows, range: Option<Range<usize>>) -> usize {
46-
match range {
47-
None => 1,
48-
Some(range) => {
49-
1 + range
50-
.map(|i| super::variable::padded_length(Some(rows.row(i).as_ref().len())))
51-
.sum::<usize>()
52-
}
53-
}
54-
}
55-
5656
/// Encodes the provided `GenericListArray` to `out` with the provided `SortOptions`
5757
///
5858
/// `rows` should contain the encoded child elements

arrow-row/src/run.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ pub fn compute_lengths<R: RunEndIndexType>(
3333
// Iterate over each run and apply the same length to all logical positions in the run
3434
for (physical_idx, &run_end) in run_ends.iter().enumerate() {
3535
let logical_end = run_end.as_usize();
36-
let row = rows.row(physical_idx);
37-
let encoded_len = variable::encoded_len(Some(row.data));
36+
let row_len = rows.row_len(physical_idx);
37+
let encoded_len = variable::padded_length(Some(row_len));
3838

3939
// Add the same length for all logical positions in this run
4040
for length in &mut lengths[logical_start..logical_end] {

0 commit comments

Comments
 (0)