@@ -457,12 +457,141 @@ fn take_bits<I: ArrowPrimitiveType>(
457457 } ) ;
458458 BooleanBuffer :: new ( output_buffer. into ( ) , 0 , len)
459459 }
460- None => {
461- BooleanBuffer :: collect_bool ( len, |idx : usize | {
462- // SAFETY: idx<indices.len()
463- values. value ( unsafe { indices. value_unchecked ( idx) . as_usize ( ) } )
460+ None => take_bits_non_null_indices ( indices, values) ,
461+ }
462+ }
463+
464+ fn take_bits_non_null_indices < I : ArrowPrimitiveType > (
465+ indices : & PrimitiveArray < I > ,
466+ values : & BooleanBuffer ,
467+ ) -> BooleanBuffer {
468+ let values_slice: & [ u8 ] = values. values ( ) ;
469+
470+ // SAFETY: u8 is trivially transmutable to u32
471+ let ( prefix, aligned, suffix) = unsafe { values_slice. align_to :: < u32 > ( ) } ;
472+
473+ // By acessing the values buffer as [u32], we allow LLVM to use gather instructions,
474+ // which only exists for 32 and 64 bits values, which in turn allows better vectorization of
475+ // the rest of code, except for the final bitmask packing, which requires usage of instrinsics.
476+ // This only fails for BooleanBuffer'a created with Vec's of integers of 8 or 16 bits, which should
477+ // be uncommon. Even then, because there are gather instructions only for unaligned data, we could use
478+ // ptr::read_unaligned without any performace penalty if either the values buffer len is multiple of 4
479+ // or if it's sliced and there's valid memory after the slice end allowing up to 24 bits of memory read
480+ // This is currently unimplemented due to increased unsafe usage and probable low usefulness
481+ if prefix. is_empty ( ) && suffix. is_empty ( ) {
482+ let values_len = I :: Native :: usize_as ( values. len ( ) ) ;
483+ let indices_chunks = indices. values ( ) . chunks_exact ( 64 ) ;
484+ let remainder = indices_chunks. remainder ( ) ;
485+
486+ let iter = indices_chunks. map ( |indices_chunk| {
487+ let indices_chunk: & [ I :: Native ; 64 ] = indices_chunk. try_into ( ) . unwrap ( ) ; // unwrap should be optimized out
488+
489+ let in_bounds = indices_chunk
490+ . iter ( )
491+ . fold ( true , |acc, bit_idx| acc & ( * bit_idx < values_len) ) ;
492+
493+ // todo: print the exact out of bounds index
494+ assert ! ( in_bounds, "Out-of-bounds index" ) ;
495+
496+ pack_bitmask ( |i| {
497+ let bit_idx = indices_chunk[ i] . as_usize ( ) + values. offset ( ) ;
498+ let data_idx = bit_idx / 32 ;
499+ let bit_offset = bit_idx % 32 ;
500+
501+ // SAFETY: bounds checked above
502+ let value = unsafe { aligned. get_unchecked ( data_idx) . to_be ( ) } ;
503+
504+ value & ( 1 << bit_offset) != 0
464505 } )
506+ } ) ;
507+
508+ let mut buffer = unsafe { MutableBuffer :: from_trusted_len_iter ( iter) } ;
509+
510+ // TODO: to avoid buffer grow+copy below, add MutableBuffer::extend_from_trusted_len_iter
511+ // or use Vec<u64>, which would be aligned to 8 bytes instead of 64
512+ if !remainder. is_empty ( ) {
513+ let mut packed = 0 ;
514+
515+ for ( bit_idx, i) in remainder. iter ( ) . enumerate ( ) {
516+ packed |= ( values. value ( i. as_usize ( ) ) as u64 ) << bit_idx;
517+ }
518+
519+ buffer. push ( packed)
465520 }
521+
522+ BooleanBuffer :: new ( buffer. into ( ) , 0 , indices. len ( ) )
523+ } else {
524+ BooleanBuffer :: collect_bool ( indices. len ( ) , |idx : usize | {
525+ // SAFETY: idx<indices.len()
526+ values. value ( unsafe { indices. value_unchecked ( idx) . as_usize ( ) } )
527+ } )
528+ }
529+ }
530+
531+ #[ cfg( target_arch = "x86" ) ]
532+ use std:: arch:: x86:: * ;
533+
534+ #[ cfg( target_arch = "x86_64" ) ]
535+ use std:: arch:: x86_64:: * ;
536+
537+ #[ inline( always) ]
538+ fn pack_bitmask ( f : impl Fn ( usize ) -> bool ) -> u64 {
539+ #[ cfg( any( target_arch = "x86" , target_arch = "x86_64" ) ) ]
540+ if cfg ! ( target_feature = "avx2" ) {
541+ return unsafe { pack_bitmask_avx2 ( f) } ;
542+ } else if cfg ! ( target_feature = "sse2" ) {
543+ return unsafe { pack_bitmask_sse2 ( f) } ;
544+ }
545+
546+ pack_bitmask_portable ( f)
547+ }
548+
/// Portable fallback for [`pack_bitmask`]: folds the 64 predicate results
/// into a `u64`, with bit `i` set iff `f(i)` is true.
#[inline(always)]
fn pack_bitmask_portable(f: impl Fn(usize) -> bool) -> u64 {
    (0..64).fold(0u64, |mask, i| mask | ((f(i) as u64) << i))
}
559+
/// AVX2 implementation of [`pack_bitmask`]: materializes each predicate
/// result as a 0x00/0xFF byte, then uses `_mm256_movemask_epi8` to collect
/// the byte MSBs, 32 lanes at a time.
///
/// # Safety
/// The caller must ensure AVX2 is available on the executing CPU.
#[inline(always)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn pack_bitmask_avx2(f: impl Fn(usize) -> bool) -> u64 {
    let bytes: [u8; 64] = std::array::from_fn(|i| if f(i) { u8::MAX } else { 0 });

    // SAFETY: each load reads 32 in-bounds bytes of `bytes`; loadu tolerates
    // any alignment.
    unsafe {
        let mut mask = 0u64;
        for (half, chunk) in bytes.chunks_exact(32).enumerate() {
            let lanes = _mm256_loadu_si256(chunk.as_ptr() as *const _);
            mask |= (_mm256_movemask_epi8(lanes) as u32 as u64) << (32 * half);
        }
        mask
    }
}
575+
/// SSE2 implementation of [`pack_bitmask`]: materializes each predicate
/// result as a 0x00/0xFF byte, then uses `_mm_movemask_epi8` to collect the
/// byte MSBs, 16 lanes at a time.
///
/// # Safety
/// The caller must ensure SSE2 is available on the executing CPU (it is
/// baseline on x86_64).
#[inline(always)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn pack_bitmask_sse2(f: impl Fn(usize) -> bool) -> u64 {
    let bytes: [u8; 64] = std::array::from_fn(|i| if f(i) { u8::MAX } else { 0 });

    // SAFETY: each load reads 16 in-bounds bytes of `bytes`; loadu tolerates
    // any alignment.
    unsafe {
        let mut mask = 0u64;
        for (quarter, chunk) in bytes.chunks_exact(16).enumerate() {
            let lanes = _mm_loadu_si128(chunk.as_ptr() as *const _);
            mask |= (_mm_movemask_epi8(lanes) as u32 as u64) << (16 * quarter);
        }
        mask
    }
}
468597
0 commit comments