Skip to content

Commit ff5b8ab

Browse files
committed
perf: improve calculating length performance for GenericByteArray in row conversion
1 parent 843bee2 commit ff5b8ab

File tree

1 file changed

+30
-21
lines changed

1 file changed

+30
-21
lines changed

arrow-row/src/lib.rs

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher};
164164
use std::sync::Arc;
165165

166166
use arrow_array::cast::*;
167-
use arrow_array::types::ArrowDictionaryKeyType;
167+
use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType};
168168
use arrow_array::*;
169169
use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer};
170170
use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -1498,31 +1498,15 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
14981498
array => tracker.push_fixed(fixed::encoded_len(array)),
14991499
DataType::Null => {},
15001500
DataType::Boolean => tracker.push_fixed(bool::ENCODED_LEN),
1501-
DataType::Binary => tracker.push_variable(
1502-
as_generic_binary_array::<i32>(array)
1503-
.iter()
1504-
.map(|slice| variable::encoded_len(slice))
1505-
),
1506-
DataType::LargeBinary => tracker.push_variable(
1507-
as_generic_binary_array::<i64>(array)
1508-
.iter()
1509-
.map(|slice| variable::encoded_len(slice))
1510-
),
1501+
DataType::Binary => push_generic_byte_array_lengths(&mut tracker, as_generic_binary_array::<i32>(array)),
1502+
DataType::LargeBinary => push_generic_byte_array_lengths(&mut tracker, as_generic_binary_array::<i64>(array)),
15111503
DataType::BinaryView => tracker.push_variable(
15121504
array.as_binary_view()
15131505
.iter()
15141506
.map(|slice| variable::encoded_len(slice))
15151507
),
1516-
DataType::Utf8 => tracker.push_variable(
1517-
array.as_string::<i32>()
1518-
.iter()
1519-
.map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
1520-
),
1521-
DataType::LargeUtf8 => tracker.push_variable(
1522-
array.as_string::<i64>()
1523-
.iter()
1524-
.map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes())))
1525-
),
1508+
DataType::Utf8 => push_generic_byte_array_lengths(&mut tracker, array.as_string::<i32>()),
1509+
DataType::LargeUtf8 => push_generic_byte_array_lengths(&mut tracker, array.as_string::<i64>()),
15261510
DataType::Utf8View => tracker.push_variable(
15271511
array.as_string_view()
15281512
.iter()
@@ -1617,6 +1601,31 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker {
16171601
tracker
16181602
}
16191603

1604+
/// Add to [`LengthTracker`] the encoded length of each item in the [`GenericByteArray`]
1605+
fn push_generic_byte_array_lengths<T: ByteArrayType>(
1606+
tracker: &mut LengthTracker,
1607+
array: &GenericByteArray<T>,
1608+
) {
1609+
if let Some(nulls) = array.nulls().filter(|n| n.null_count() > 0) {
1610+
tracker.push_variable(
1611+
array
1612+
.offsets()
1613+
.lengths()
1614+
.zip(nulls.iter())
1615+
.map(|(length, is_valid)| if is_valid { Some(length) } else { None })
1616+
.map(variable::padded_length),
1617+
)
1618+
} else {
1619+
tracker.push_variable(
1620+
array
1621+
.offsets()
1622+
.lengths()
1623+
.map(Some)
1624+
.map(variable::padded_length),
1625+
)
1626+
}
1627+
}
1628+
16201629
/// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses
16211630
fn encode_column(
16221631
data: &mut [u8],

0 commit comments

Comments
 (0)