Skip to content

Commit 37c1e6b

Browse files
committed
perf: vectorize take_bits
1 parent a67cd19 commit 37c1e6b

File tree

1 file changed

+133
-4
lines changed

1 file changed

+133
-4
lines changed

arrow-select/src/take.rs

Lines changed: 133 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -457,12 +457,141 @@ fn take_bits<I: ArrowPrimitiveType>(
457457
});
458458
BooleanBuffer::new(output_buffer.into(), 0, len)
459459
}
460-
None => {
461-
BooleanBuffer::collect_bool(len, |idx: usize| {
462-
// SAFETY: idx<indices.len()
463-
values.value(unsafe { indices.value_unchecked(idx).as_usize() })
460+
None => take_bits_non_null_indices(indices, values),
461+
}
462+
}
463+
464+
fn take_bits_non_null_indices<I: ArrowPrimitiveType>(
465+
indices: &PrimitiveArray<I>,
466+
values: &BooleanBuffer,
467+
) -> BooleanBuffer {
468+
let values_slice: &[u8] = values.values();
469+
470+
// SAFETY: u8 is trivially transmutable to u32
471+
let (prefix, aligned, suffix) = unsafe { values_slice.align_to::<u32>() };
472+
473+
// By acessing the values buffer as [u32], we allow LLVM to use gather instructions,
474+
// which only exists for 32 and 64 bits values, which in turn allows better vectorization of
475+
// the rest of code, except for the final bitmask packing, which requires usage of instrinsics.
476+
// This only fails for BooleanBuffer'a created with Vec's of integers of 8 or 16 bits, which should
477+
// be uncommon. Even then, because there are gather instructions only for unaligned data, we could use
478+
// ptr::read_unaligned without any performace penalty if either the values buffer len is multiple of 4
479+
// or if it's sliced and there's valid memory after the slice end allowing up to 24 bits of memory read
480+
// This is currently unimplemented due to increased unsafe usage and probable low usefulness
481+
if prefix.is_empty() && suffix.is_empty() {
482+
let values_len = I::Native::usize_as(values.len());
483+
let indices_chunks = indices.values().chunks_exact(64);
484+
let remainder = indices_chunks.remainder();
485+
486+
let iter = indices_chunks.map(|indices_chunk| {
487+
let indices_chunk: &[I::Native; 64] = indices_chunk.try_into().unwrap(); // unwrap should be optimized out
488+
489+
let in_bounds = indices_chunk
490+
.iter()
491+
.fold(true, |acc, bit_idx| acc & (*bit_idx < values_len));
492+
493+
// todo: print the exact out of bounds index
494+
assert!(in_bounds, "Out-of-bounds index");
495+
496+
pack_bitmask(|i| {
497+
let bit_idx = indices_chunk[i].as_usize() + values.offset();
498+
let data_idx = bit_idx / 32;
499+
let bit_offset = bit_idx % 32;
500+
501+
// SAFETY: bounds checked above
502+
let value = unsafe { aligned.get_unchecked(data_idx).to_be() };
503+
504+
value & (1 << bit_offset) != 0
464505
})
506+
});
507+
508+
let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(iter) };
509+
510+
// TODO: to avoid buffer grow+copy below, add MutableBuffer::extend_from_trusted_len_iter
511+
// or use Vec<u64>, which would be aligned to 8 bytes instead of 64
512+
if !remainder.is_empty() {
513+
let mut packed = 0;
514+
515+
for (bit_idx, i) in remainder.iter().enumerate() {
516+
packed |= (values.value(i.as_usize()) as u64) << bit_idx;
517+
}
518+
519+
buffer.push(packed)
465520
}
521+
522+
BooleanBuffer::new(buffer.into(), 0, indices.len())
523+
} else {
524+
BooleanBuffer::collect_bool(indices.len(), |idx: usize| {
525+
// SAFETY: idx<indices.len()
526+
values.value(unsafe { indices.value_unchecked(idx).as_usize() })
527+
})
528+
}
529+
}
530+
531+
#[cfg(target_arch = "x86")]
532+
use std::arch::x86::*;
533+
534+
#[cfg(target_arch = "x86_64")]
535+
use std::arch::x86_64::*;
536+
537+
#[inline(always)]
538+
fn pack_bitmask(f: impl Fn(usize) -> bool) -> u64 {
539+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
540+
if cfg!(target_feature = "avx2") {
541+
return unsafe { pack_bitmask_avx2(f) };
542+
} else if cfg!(target_feature = "sse2") {
543+
return unsafe { pack_bitmask_sse2(f) };
544+
}
545+
546+
pack_bitmask_portable(f)
547+
}
548+
549+
/// Portable fallback for [`pack_bitmask`]: packs `f(0)..f(63)` into a `u64`
/// bitmask, with bit `i` holding `f(i)`. Written branch-free so LLVM can
/// still auto-vectorize it on targets without a dedicated movemask path.
#[inline(always)]
fn pack_bitmask_portable(f: impl Fn(usize) -> bool) -> u64 {
    (0..64).fold(0u64, |mask, i| mask | ((f(i) as u64) << i))
}
559+
560+
#[inline(always)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Packs the 64 booleans produced by `f(0)..f(63)` into a `u64` bitmask
/// using AVX2 `movemask` over two 32-byte lanes.
///
/// # Safety
/// The caller must ensure the `avx2` target feature is available; the
/// dispatcher guards this call behind a compile-time
/// `cfg!(target_feature = "avx2")` check.
unsafe fn pack_bitmask_avx2(f: impl Fn(usize) -> bool) -> u64 {
    unsafe {
        // Expand each bool to a full 0x00/0xFF byte so movemask can collect
        // the per-byte sign bits into a packed mask.
        let unpacked: [u8; 64] = std::array::from_fn(|i| if f(i) { u8::MAX } else { 0 });

        // Bytes 0..32 -> mask bits 0..32
        let low = _mm256_loadu_si256(unpacked.as_ptr() as *const _);
        let low = _mm256_movemask_epi8(low) as u32 as u64;

        // Bytes 32..64 -> mask bits 32..64
        let high = _mm256_loadu_si256(unpacked[32..].as_ptr() as *const _);
        let high = _mm256_movemask_epi8(high) as u32 as u64;

        (high << 32) | low
    }
}
575+
576+
#[inline(always)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Packs the 64 booleans produced by `f(0)..f(63)` into a `u64` bitmask
/// using SSE2 `movemask` over four 16-byte lanes.
///
/// # Safety
/// The caller must ensure the `sse2` target feature is available; the
/// dispatcher guards this call behind a compile-time
/// `cfg!(target_feature = "sse2")` check.
unsafe fn pack_bitmask_sse2(f: impl Fn(usize) -> bool) -> u64 {
    // Expand each bool to a full 0x00/0xFF byte so movemask can collect
    // the per-byte sign bits into a packed mask.
    let unpacked: [u8; 64] = std::array::from_fn(|i| if f(i) { u8::MAX } else { 0 });

    unsafe {
        // Bytes 0..16 -> mask bits 0..16
        let lolo = _mm_loadu_si128(unpacked.as_ptr() as *const _);
        let lolo = _mm_movemask_epi8(lolo) as u32 as u64;

        // Bytes 16..32 -> mask bits 16..32
        let lo = _mm_loadu_si128(unpacked[16..].as_ptr() as *const _);
        let lo = _mm_movemask_epi8(lo) as u32 as u64;

        // Bytes 32..48 -> mask bits 32..48
        let hi = _mm_loadu_si128(unpacked[32..].as_ptr() as *const _);
        let hi = _mm_movemask_epi8(hi) as u32 as u64;

        // Bytes 48..64 -> mask bits 48..64
        let hihi = _mm_loadu_si128(unpacked[48..].as_ptr() as *const _);
        let hihi = _mm_movemask_epi8(hihi) as u32 as u64;

        (hihi << 48) | (hi << 32) | (lo << 16) | lolo
    }
}
468597

0 commit comments

Comments
 (0)