Skip to content

Commit 9213ffd

Browse files
authored
perf: improve performance of encoding GenericByteArray by 8% (#9054)
# Which issue does this PR close?
N/A
# Rationale for this change
Make row conversion faster.
# What changes are included in this PR?
Created a "manual" iterator over the byte array's values and offsets, with an optimized path for arrays that contain no nulls.
# Are these changes tested?
Yes, by the existing tests.
# Are there any user-facing changes?
No.
1 parent 5ddddbd commit 9213ffd

File tree

2 files changed

+51
-9
lines changed

2 files changed

+51
-9
lines changed

arrow-row/src/lib.rs

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1644,24 +1644,22 @@ fn encode_column(
16441644
}
16451645
}
16461646
DataType::Binary => {
1647-
variable::encode(data, offsets, as_generic_binary_array::<i32>(column).iter(), opts)
1647+
variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::<i32>(column), opts)
16481648
}
16491649
DataType::BinaryView => {
16501650
variable::encode(data, offsets, column.as_binary_view().iter(), opts)
16511651
}
16521652
DataType::LargeBinary => {
1653-
variable::encode(data, offsets, as_generic_binary_array::<i64>(column).iter(), opts)
1653+
variable::encode_generic_byte_array(data, offsets, as_generic_binary_array::<i64>(column), opts)
16541654
}
1655-
DataType::Utf8 => variable::encode(
1655+
DataType::Utf8 => variable::encode_generic_byte_array(
16561656
data, offsets,
1657-
column.as_string::<i32>().iter().map(|x| x.map(|x| x.as_bytes())),
1657+
column.as_string::<i32>(),
16581658
opts,
16591659
),
1660-
DataType::LargeUtf8 => variable::encode(
1660+
DataType::LargeUtf8 => variable::encode_generic_byte_array(
16611661
data, offsets,
1662-
column.as_string::<i64>()
1663-
.iter()
1664-
.map(|x| x.map(|x| x.as_bytes())),
1662+
column.as_string::<i64>(),
16651663
opts,
16661664
),
16671665
DataType::Utf8View => variable::encode(

arrow-row/src/variable.rs

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,10 @@
1717

1818
use crate::null_sentinel;
1919
use arrow_array::builder::BufferBuilder;
20+
use arrow_array::types::ByteArrayType;
2021
use arrow_array::*;
21-
use arrow_buffer::MutableBuffer;
2222
use arrow_buffer::bit_util::ceil;
23+
use arrow_buffer::{ArrowNativeType, MutableBuffer};
2324
use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN};
2425
use arrow_schema::{DataType, SortOptions};
2526
use builder::make_view;
@@ -84,6 +85,48 @@ pub fn encode<'a, I: Iterator<Item = Option<&'a [u8]>>>(
8485
}
8586
}
8687

88+
/// Encodes a [`GenericByteArray`] by calling [`encode`] with a hand-built value iterator.
///
/// Rather than using the array's own iterator, this walks consecutive pairs of
/// `value_offsets()` and slices the values buffer directly. When the array has no
/// null buffer, or its null buffer reports a null count of zero, the per-item
/// validity check is skipped and every item is yielded as `Some`.
///
/// `data`, `offsets` and `opts` are forwarded to [`encode`] unchanged.
89+
pub(crate) fn encode_generic_byte_array<T: ByteArrayType>(
90+
data: &mut [u8],
91+
offsets: &mut [usize],
92+
input_array: &GenericByteArray<T>,
93+
opts: SortOptions,
94+
) {
95+
let input_offsets = input_array.value_offsets();
96+
let bytes = input_array.values().as_slice();
97+
98+
// Only take the null-aware path when at least one slot is actually null;
// a null buffer with a null count of zero is treated the same as no null buffer.
if let Some(null_buffer) = input_array.nulls().filter(|x| x.null_count() > 0) {
99+
let input_iter =
100+
input_offsets
101+
// Each window of two adjacent offsets is the [start, end) byte range of one item
.windows(2)
102+
// Pair every item's offset window with its validity flag
.zip(null_buffer.iter())
103+
.map(|(start_end, is_valid)| {
104+
if is_valid {
105+
let item_range = start_end[0].as_usize()..start_end[1].as_usize();
106+
// SAFETY: `item_range` is built from two adjacent entries of `value_offsets()`;
107+
// this relies on the array's offsets being valid ranges into `bytes` by construction
108+
let item = unsafe { bytes.get_unchecked(item_range) };
109+
Some(item)
110+
} else {
111+
None
112+
}
113+
});
114+
115+
encode(data, offsets, input_iter, opts);
116+
} else {
117+
// No nulls to report: skip the validity check and yield every item as `Some`
118+
let input_iter = input_offsets.windows(2).map(|start_end| {
119+
let item_range = start_end[0].as_usize()..start_end[1].as_usize();
120+
// SAFETY: `item_range` is built from two adjacent entries of `value_offsets()`;
121+
// this relies on the array's offsets being valid ranges into `bytes` by construction
122+
let item = unsafe { bytes.get_unchecked(item_range) };
123+
Some(item)
124+
});
125+
126+
encode(data, offsets, input_iter, opts);
127+
}
128+
}
129+
87130
pub fn encode_null(out: &mut [u8], opts: SortOptions) -> usize {
88131
out[0] = null_sentinel(opts);
89132
1
@@ -97,6 +140,7 @@ pub fn encode_empty(out: &mut [u8], opts: SortOptions) -> usize {
97140
1
98141
}
99142

143+
#[inline]
100144
pub fn encode_one(out: &mut [u8], val: Option<&[u8]>, opts: SortOptions) -> usize {
101145
match val {
102146
None => encode_null(out, opts),

0 commit comments

Comments
 (0)