Skip to content

Commit b6143ff

Browse files
committed
Restore non-null fastpath but unsafely
1 parent 57cacc4 commit b6143ff

File tree

1 file changed

+31
-14
lines changed

1 file changed

+31
-14
lines changed

arrow-select/src/take.rs

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -495,14 +495,19 @@ fn take_boolean<IndexType: ArrowPrimitiveType>(
495495
/// # Safety
496496
/// Each `(start, end)` in `ranges` must be in-bounds of `src`, and
497497
/// `capacity` must equal the total bytes across all ranges.
498-
unsafe fn copy_byte_ranges(src: &[u8], ranges: &[(usize, usize)], capacity: usize) -> Vec<u8> {
498+
unsafe fn copy_byte_ranges(
499+
src: &[u8],
500+
ranges: &[(usize, usize)],
501+
capacity: usize,
502+
values: &mut Vec<u8>,
503+
) {
504+
values.reserve(capacity);
499505
debug_assert_eq!(
500506
ranges.iter().map(|(s, e)| e - s).sum::<usize>(),
501507
capacity,
502508
"capacity must equal total bytes across all ranges"
503509
);
504510
let src_len = src.len();
505-
let mut values = Vec::with_capacity(capacity);
506511
let src = src.as_ptr();
507512
let mut dst = values.as_mut_ptr();
508513
for &(start, end) in ranges {
@@ -523,14 +528,14 @@ unsafe fn copy_byte_ranges(src: &[u8], ranges: &[(usize, usize)], capacity: usiz
523528
// SAFETY: caller guarantees `capacity` == total bytes across all ranges,
524529
// so the loop above wrote exactly `capacity` bytes.
525530
unsafe { values.set_len(capacity) };
526-
values
527531
}
528532

529533
/// `take` implementation for string arrays
530534
fn take_bytes<T: ByteArrayType, IndexType: ArrowPrimitiveType>(
531535
array: &GenericByteArray<T>,
532536
indices: &PrimitiveArray<IndexType>,
533537
) -> Result<GenericByteArray<T>, ArrowError> {
538+
let mut values = Vec::new();
534539
let mut offsets = Vec::with_capacity(indices.len() + 1);
535540
offsets.push(T::Offset::default());
536541

@@ -539,12 +544,10 @@ fn take_bytes<T: ByteArrayType, IndexType: ArrowPrimitiveType>(
539544
let mut capacity = 0;
540545
let nulls = take_nulls(array.nulls(), indices);
541546

542-
// Pass 1: compute offsets and collect byte ranges.
543547
// Branch on output nulls — `None` means every output slot is valid.
544-
let ranges = match nulls.as_ref().filter(|n| n.null_count() > 0) {
548+
match nulls.as_ref().filter(|n| n.null_count() > 0) {
545549
// Fast path: no nulls in output, every index is valid.
546550
None => {
547-
let mut ranges = Vec::with_capacity(indices.len());
548551
for index in indices.values() {
549552
let index = index.as_usize();
550553
let start = input_offsets[index].as_usize();
@@ -554,9 +557,26 @@ fn take_bytes<T: ByteArrayType, IndexType: ArrowPrimitiveType>(
554557
T::Offset::from_usize(capacity)
555558
.ok_or_else(|| ArrowError::OffsetOverflowError(capacity))?,
556559
);
557-
ranges.push((start, end));
558560
}
559-
ranges
561+
562+
values.reserve(capacity);
563+
564+
let mut dst = values.as_mut_ptr();
565+
566+
for index in indices.values() {
567+
// SAFETY: in-bounds proven by the first loop's bounds-checked offset access.
568+
// dst stays within reserved capacity computed from the same indices.
569+
unsafe {
570+
let data: &[u8] = array.value_unchecked(index.as_usize()).as_ref();
571+
std::ptr::copy_nonoverlapping(data.as_ptr(), dst, data.len());
572+
dst = dst.add(data.len());
573+
}
574+
}
575+
576+
// SAFETY: wrote exactly `capacity` bytes above; reserved on line above.
577+
unsafe {
578+
values.set_len(capacity);
579+
}
560580
}
561581
// Nullable path: only process valid (non-null) output positions.
562582
Some(output_nulls) => {
@@ -566,6 +586,7 @@ fn take_bytes<T: ByteArrayType, IndexType: ArrowPrimitiveType>(
566586
// Pre-fill offsets; we overwrite valid positions below.
567587
offsets.resize(indices.len() + 1, T::Offset::default());
568588

589+
// Pass 1: find all valid ranges that need to be copied.
569590
for i in output_nulls.valid_indices() {
570591
let current_offset = T::Offset::from_usize(capacity)
571592
.ok_or_else(|| ArrowError::OffsetOverflowError(capacity))?;
@@ -589,15 +610,11 @@ fn take_bytes<T: ByteArrayType, IndexType: ArrowPrimitiveType>(
589610
let final_offset = T::Offset::from_usize(capacity)
590611
.ok_or_else(|| ArrowError::OffsetOverflowError(capacity))?;
591612
offsets[last_filled + 1..].fill(final_offset);
592-
ranges
613+
// Pass 2: copy byte data for all collected ranges.
614+
unsafe { copy_byte_ranges(input_values, &ranges, capacity, &mut values) };
593615
}
594616
};
595617

596-
// Pass 2: copy byte data for all collected ranges.
597-
let values = unsafe { copy_byte_ranges(input_values, &ranges, capacity) };
598-
599-
debug_assert_eq!(capacity, values.len());
600-
601618
// SAFETY: offsets are monotonically increasing and in-bounds of `values`,
602619
// and `nulls` (if present) has length == `indices.len()`.
603620
let array = unsafe {

0 commit comments

Comments
 (0)