Skip to content

Commit 8beeab2

Browse files
rluvatonalamb
andauthored
perf: improve calculating length performance for view byte array in row conversion (#9080)
# Which issue does this PR close? N/A # Rationale for this change Making the row length calculation faster which result in faster row conversion # What changes are included in this PR? 1. Instead of iterating over the bytes and getting the length from the byte slice, we use the offsets directly, this is faster as it saves us going to the buffer 2. Added new API for `GenericByteViewArray` (explained below) # Are these changes tested? Yes # Are there any user-facing changes? Yes, added `lengths` function to `GenericByteViewArray` to get an iterator over the lengths of the items in the array ----- Related to: - #9078 - #9079 --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent a7b8907 commit 8beeab2

File tree

2 files changed

+165
-12
lines changed

2 files changed

+165
-12
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 134 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,26 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
438438
})
439439
}
440440

441+
/// Return an iterator over the length of each array element, including null values.
442+
///
443+
/// Null values length would equal to the underlying bytes length and NOT 0
444+
///
445+
/// Example of getting 0 for null values
446+
/// ```rust
447+
/// # use arrow_array::StringViewArray;
448+
/// # use arrow_array::Array;
449+
/// use arrow_data::ByteView;
450+
///
451+
/// fn lengths_with_zero_for_nulls(view: &StringViewArray) -> impl Iterator<Item = u32> {
452+
/// view.lengths()
453+
/// .enumerate()
454+
/// .map(|(index, length)| if view.is_null(index) { 0 } else { length })
455+
/// }
456+
/// ```
457+
pub fn lengths(&self) -> impl ExactSizeIterator<Item = u32> + Clone {
458+
self.views().iter().map(|v| *v as u32)
459+
}
460+
441461
/// Returns a zero-copy slice of this array with the indicated offset and length.
442462
pub fn slice(&self, offset: usize, length: usize) -> Self {
443463
Self {
@@ -1184,7 +1204,7 @@ mod tests {
11841204
use crate::{
11851205
Array, BinaryViewArray, GenericBinaryArray, GenericByteViewArray, StringViewArray,
11861206
};
1187-
use arrow_buffer::{Buffer, ScalarBuffer};
1207+
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
11881208
use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN};
11891209
use rand::prelude::StdRng;
11901210
use rand::{Rng, SeedableRng};
@@ -1681,4 +1701,117 @@ mod tests {
16811701
);
16821702
}
16831703
}
1704+
1705+
#[test]
1706+
fn empty_array_should_return_empty_lengths_iterator() {
1707+
let empty = GenericByteViewArray::<BinaryViewType>::from(Vec::<&[u8]>::new());
1708+
1709+
let mut lengths_iter = empty.lengths();
1710+
assert_eq!(lengths_iter.len(), 0);
1711+
assert_eq!(lengths_iter.next(), None);
1712+
}
1713+
1714+
#[test]
1715+
fn array_lengths_should_return_correct_length_for_both_inlined_and_non_inlined() {
1716+
let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1717+
// Not inlined as longer than 12 bytes
1718+
b"Supercalifragilisticexpialidocious" as &[u8],
1719+
// Inlined as shorter than 12 bytes
1720+
b"Hello",
1721+
// Empty value
1722+
b"",
1723+
// Exactly 12 bytes
1724+
b"abcdefghijkl",
1725+
]);
1726+
1727+
let mut lengths_iter = cases.lengths();
1728+
1729+
assert_eq!(lengths_iter.len(), cases.len());
1730+
1731+
let cases_iter = cases.iter();
1732+
1733+
for case in cases_iter {
1734+
let case_value = case.unwrap();
1735+
let length = lengths_iter.next().expect("Should have a length");
1736+
1737+
assert_eq!(case_value.len(), length as usize);
1738+
}
1739+
1740+
assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1741+
}
1742+
1743+
#[test]
1744+
fn array_lengths_should_return_the_underlying_length_for_null_values() {
1745+
let cases = GenericByteViewArray::<BinaryViewType>::from(vec![
1746+
// Not inlined as longer than 12 bytes
1747+
b"Supercalifragilisticexpialidocious" as &[u8],
1748+
// Inlined as shorter than 12 bytes
1749+
b"Hello",
1750+
// Empty value
1751+
b"",
1752+
// Exactly 12 bytes
1753+
b"abcdefghijkl",
1754+
]);
1755+
1756+
let (views, buffer, _) = cases.clone().into_parts();
1757+
1758+
// Keeping the values but just adding nulls on top
1759+
let cases_with_all_nulls = GenericByteViewArray::<BinaryViewType>::new(
1760+
views,
1761+
buffer,
1762+
Some(NullBuffer::new_null(cases.len())),
1763+
);
1764+
1765+
let lengths_iter = cases.lengths();
1766+
let mut all_nulls_lengths_iter = cases_with_all_nulls.lengths();
1767+
1768+
assert_eq!(lengths_iter.len(), all_nulls_lengths_iter.len());
1769+
1770+
for expected_length in lengths_iter {
1771+
let actual_length = all_nulls_lengths_iter.next().expect("Should have a length");
1772+
1773+
assert_eq!(expected_length, actual_length);
1774+
}
1775+
1776+
assert_eq!(
1777+
all_nulls_lengths_iter.next(),
1778+
None,
1779+
"Should not have more lengths"
1780+
);
1781+
}
1782+
1783+
#[test]
1784+
fn array_lengths_on_sliced_should_only_return_lengths_for_sliced_data() {
1785+
let array = GenericByteViewArray::<BinaryViewType>::from(vec![
1786+
b"aaaaaaaaaaaaaaaaaaaaaaaaaaa" as &[u8],
1787+
b"Hello",
1788+
b"something great",
1789+
b"is",
1790+
b"coming soon!",
1791+
b"when you find what it is",
1792+
b"let me know",
1793+
b"cause",
1794+
b"I",
1795+
b"have no idea",
1796+
b"what it",
1797+
b"is",
1798+
]);
1799+
1800+
let sliced_array = array.slice(2, array.len() - 3);
1801+
1802+
let mut lengths_iter = sliced_array.lengths();
1803+
1804+
assert_eq!(lengths_iter.len(), sliced_array.len());
1805+
1806+
let values_iter = sliced_array.iter();
1807+
1808+
for value in values_iter {
1809+
let value = value.unwrap();
1810+
let length = lengths_iter.next().expect("Should have a length");
1811+
1812+
assert_eq!(value.len(), length as usize);
1813+
}
1814+
1815+
assert_eq!(lengths_iter.next(), None, "Should not have more lengths");
1816+
}
16841817
}

arrow-row/src/lib.rs

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher};
164164
use std::sync::Arc;
165165

166166
use arrow_array::cast::*;
167-
use arrow_array::types::ArrowDictionaryKeyType;
167+
use arrow_array::types::{ArrowDictionaryKeyType, ByteViewType};
168168
use arrow_array::*;
169169
use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
170170
use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -1555,11 +1555,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
15551555
.iter()
15561556
.map(|slice| variable::encoded_len(slice))
15571557
),
1558-
DataType::BinaryView => tracker.push_variable(
1559-
array.as_binary_view()
1560-
.iter()
1561-
.map(|slice| variable::encoded_len(slice))
1562-
),
1558+
DataType::BinaryView => push_byte_view_array_lengths(&mut tracker, array.as_binary_view()),
15631559
DataType::Utf8 => tracker.push_variable(
15641560
array.as_string::<i32>()
15651561
.iter()
@@ -1570,11 +1566,7 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
15701566
.iter()
15711567
.map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
15721568
),
1573-
DataType::Utf8View => tracker.push_variable(
1574-
array.as_string_view()
1575-
.iter()
1576-
.map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
1577-
),
1569+
DataType::Utf8View => push_byte_view_array_lengths(&mut tracker, array.as_string_view()),
15781570
DataType::FixedSizeBinary(len) => {
15791571
let len = len.to_usize().unwrap();
15801572
tracker.push_fixed(1 + len)
@@ -1664,6 +1656,34 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
16641656
tracker
16651657
}
16661658

1659+
/// Add to [`LengthTracker`] the encoded length of each item in the [`GenericByteViewArray`]
1660+
fn push_byte_view_array_lengths<T: ByteViewType>(
1661+
tracker: &mut LengthTracker,
1662+
array: &GenericByteViewArray<T>,
1663+
) {
1664+
if let Some(nulls) = array.nulls().filter(|n| n.null_count() > 0) {
1665+
tracker.push_variable(
1666+
array
1667+
.lengths()
1668+
.zip(nulls.iter())
1669+
.map(|(length, is_valid)| {
1670+
if is_valid {
1671+
Some(length as usize)
1672+
} else {
1673+
None
1674+
}
1675+
})
1676+
.map(variable::padded_length),
1677+
)
1678+
} else {
1679+
tracker.push_variable(
1680+
array
1681+
.lengths()
1682+
.map(|len| variable::padded_length(Some(len as usize))),
1683+
)
1684+
}
1685+
}
1686+
16671687
/// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses
16681688
fn encode_column(
16691689
data: &mut [u8],

0 commit comments

Comments
 (0)