From bedcfd6e3ff3fba618f0cbf6d744576c84dee311 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Dec 2025 15:38:19 -0500 Subject: [PATCH 01/10] Add BinaryBuffer::from_bitwise_binary_op --- arrow-arith/src/boolean.rs | 24 ++-- arrow-buffer/src/buffer/boolean.rs | 174 ++++++++++++++++++++++++++++- arrow-buffer/src/buffer/ops.rs | 46 ++++---- arrow-select/src/nullif.rs | 18 ++- 4 files changed, 213 insertions(+), 49 deletions(-) diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs index d94df49de256..6bf438e64618 100644 --- a/arrow-arith/src/boolean.rs +++ b/arrow-arith/src/boolean.rs @@ -23,7 +23,7 @@ //! [here](https://doc.rust-lang.org/stable/core/arch/) for more information. use arrow_array::*; -use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper}; +use arrow_buffer::buffer::bitwise_quaternary_op_helper; use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not}; use arrow_schema::ArrowError; @@ -74,7 +74,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_null_buffer.buffer(), right_null_buffer.offset(), left_values.inner(), @@ -100,7 +100,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result Result Result { // Same as above - Some(bitwise_bin_op_helper( + Some(BooleanBuffer::from_bitwise_binary_op( right_nulls.buffer(), right_nulls.offset(), left_values.inner(), @@ -195,7 +196,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result Result( + left: impl AsRef<[u8]>, + left_offset_in_bits: usize, + right: impl AsRef<[u8]>, + right_offset_in_bits: usize, + len_in_bits: usize, + mut op: F, + ) -> Self + where + F: FnMut(u64, u64) -> u64, + { + // try fast path for aligned input + if left_offset_in_bits % 8 == 0 && right_offset_in_bits % 8 == 0 { + if let Some(result) = Self::try_from_aligned_bitwise_binary_op( + 
&left.as_ref()[left_offset_in_bits / 8..], // aligned to byte boundary + &right.as_ref()[right_offset_in_bits / 8..], + len_in_bits, + &mut op, + ) { + return result; + } + } + + // each chunk is 64 bits + let left_chunks = BitChunks::new(left.as_ref(), left_offset_in_bits, len_in_bits); + let right_chunks = BitChunks::new(right.as_ref(), right_offset_in_bits, len_in_bits); + assert_eq!(left_chunks.num_u64s(), right_chunks.num_u64s()); + + let mut result = MutableBuffer::with_capacity(left_chunks.num_u64s() * 8); + + for (left, right) in left_chunks.iter().zip(right_chunks.iter()) { + // SAFETY: reserved enough capacity above, (exactly num_u64s() + // items) and we assume `BitChunks` correctly reports upper bound + unsafe { + result.push_unchecked(op(left, right)); + } + } + if left_chunks.remainder_len() > 0 { + debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate + let op_result = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); + // SAFETY: reserved enough capacity above, (exactly num_u64s() + // items) and we assume `BitChunks` correctly reports upper bound + unsafe { + result.push_unchecked(op_result); + } + // Just pushed one u64, which may have trailing zeros + result.truncate(left_chunks.num_bytes()); + } + + BooleanBuffer { + buffer: Buffer::from(result), + offset: 0, + len: len_in_bits, + } + } + + /// Like [`Self::from_bitwise_binary_op`] but optimized for the case where the + /// inputs are aligned to byte boundaries + /// + /// Returns `None` if the inputs are not fully u64 aligned + fn try_from_aligned_bitwise_binary_op( + left: &[u8], + right: &[u8], + len_in_bits: usize, + op: &mut F, + ) -> Option + where + F: FnMut(u64, u64) -> u64, + { + // Safety: all valid bytes are valid u64s + let (left_prefix, left_u64s, left_suffix) = unsafe { left.align_to::() }; + let (right_prefix, right_u64s, right_suffix) = unsafe { right.align_to::() }; + if !(left_prefix.is_empty() + && right_prefix.is_empty() + && 
left_suffix.is_empty() + && right_suffix.is_empty()) + { + // Couldn't make this case any faster than the default path + // would be cool to handle non empty prefixes/suffixes too, + return None; + } + // the buffers are word (64 bit) aligned, so use optimized Vec code. + let result_u64s = left_u64s + .iter() + .zip(right_u64s.iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + Some(BooleanBuffer::new( + Buffer::from(result_u64s), + 0, + len_in_bits, + )) + } + /// Returns the number of set bits in this buffer pub fn count_set_bits(&self) -> usize { self.buffer.count_set_bits_offset(self.offset, self.len) @@ -591,4 +721,42 @@ mod tests { assert_eq!(result, expected); } } + + #[test] + fn test_from_bitwise_binary_op() { + // pick random boolean inputs + let input_bools_left = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_bools_right = (0..1024) + .map(|_| rand::random::()) + .collect::>(); + let input_buffer_left = BooleanBuffer::from(&input_bools_left[..]); + let input_buffer_right = BooleanBuffer::from(&input_bools_right[..]); + + for left_offset in 0..200 { + for right_offset in [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200] { + for len_offset in [0, 1, 44, 100, 256, 300, 512] { + let len = 1024 - len_offset - left_offset.max(right_offset); // ensure we don't go out of bounds + // compute with AND + let result = BooleanBuffer::from_bitwise_binary_op( + input_buffer_left.values(), + left_offset, + input_buffer_right.values(), + right_offset, + len, + |a, b| a & b, + ); + // compute directly from bools + let expected = input_bools_left[left_offset..] 
+ .iter() + .zip(&input_bools_right[right_offset..]) + .take(len) + .map(|(a, b)| *a & *b) + .collect::(); + assert_eq!(result, expected); + } + } + } + } } diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs index 05593504b1cf..c9e7971709e7 100644 --- a/arrow-buffer/src/buffer/ops.rs +++ b/arrow-buffer/src/buffer/ops.rs @@ -61,35 +61,30 @@ where /// Apply a bitwise operation `op` to two inputs and return the result as a Buffer. /// The inputs are treated as bitmaps, meaning that offsets and length are specified in number of bits. +#[deprecated( + since = "57.2.0", + note = "use BooleanBuffer::from_bitwise_binary_op instead" +)] pub fn bitwise_bin_op_helper( left: &Buffer, left_offset_in_bits: usize, right: &Buffer, right_offset_in_bits: usize, len_in_bits: usize, - mut op: F, + op: F, ) -> Buffer where F: FnMut(u64, u64) -> u64, { - let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits); - let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits); - - let chunks = left_chunks - .iter() - .zip(right_chunks.iter()) - .map(|(left, right)| op(left, right)); - // Soundness: `BitChunks` is a `BitChunks` iterator which - // correctly reports its upper bound - let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; - - let remainder_bytes = ceil(left_chunks.remainder_len(), 8); - let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); - // we are counting its starting from the least significant bit, to to_le_bytes should be correct - let rem = &rem.to_le_bytes()[0..remainder_bytes]; - buffer.extend_from_slice(rem); - - buffer.into() + BooleanBuffer::from_bitwise_binary_op( + left, + left_offset_in_bits, + right, + right_offset_in_bits, + len_in_bits, + op, + ) + .into_inner() } /// Apply a bitwise operation `op` to one input and return the result as a Buffer. 
@@ -119,7 +114,7 @@ pub fn buffer_bin_and( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -127,6 +122,7 @@ pub fn buffer_bin_and( len_in_bits, |a, b| a & b, ) + .into_inner() } /// Apply a bitwise or to two inputs and return the result as a Buffer. @@ -138,7 +134,7 @@ pub fn buffer_bin_or( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -146,6 +142,7 @@ pub fn buffer_bin_or( len_in_bits, |a, b| a | b, ) + .into_inner() } /// Apply a bitwise xor to two inputs and return the result as a Buffer. @@ -157,7 +154,7 @@ pub fn buffer_bin_xor( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -165,6 +162,7 @@ pub fn buffer_bin_xor( len_in_bits, |a, b| a ^ b, ) + .into_inner() } /// Apply a bitwise and_not to two inputs and return the result as a Buffer. @@ -176,7 +174,7 @@ pub fn buffer_bin_and_not( right_offset_in_bits: usize, len_in_bits: usize, ) -> Buffer { - bitwise_bin_op_helper( + BooleanBuffer::from_bitwise_binary_op( left, left_offset_in_bits, right, @@ -184,11 +182,11 @@ pub fn buffer_bin_and_not( len_in_bits, |a, b| a & !b, ) + .into_inner() } /// Apply a bitwise not to one input and return the result as a Buffer. /// The input is treated as a bitmap, meaning that offset and length are specified in number of bits. pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer { - // TODO: should we deprecate this function in favor of the Buffer ! impl ? 
BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, |a| !a).into_inner() } diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 211cabf7afc0..9c386b3edf07 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -18,7 +18,6 @@ //! Implements the `nullif` function for Arrow arrays. use arrow_array::{Array, ArrayRef, BooleanArray, make_array}; -use arrow_buffer::buffer::bitwise_bin_op_helper; use arrow_buffer::{BooleanBuffer, NullBuffer}; use arrow_schema::{ArrowError, DataType}; @@ -75,7 +74,7 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut valid_count = 0; - let b = bitwise_bin_op_helper( + let b = BooleanBuffer::from_bitwise_binary_op( left.buffer(), left.offset(), right.inner(), @@ -91,18 +90,15 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result { let mut null_count = 0; - let buffer = - BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| { - let t = !b; - null_count += t.count_zeros() as usize; - t - }) - .into_inner(); - (buffer, null_count) + let b = BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| { + let t = !b; + null_count += t.count_zeros() as usize; + t + }); + (b, null_count) } }; - let combined = BooleanBuffer::new(combined, 0, len); // Safety: // Counted nulls whilst computing let nulls = unsafe { NullBuffer::new_unchecked(combined, null_count) }; From edfc085c1e0eabac3aa53d9fa918891e4bf95ebd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Dec 2025 16:29:39 -0500 Subject: [PATCH 02/10] Improve error message --- arrow-select/src/nullif.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs index 9c386b3edf07..4f2c5383a5bc 100644 --- a/arrow-select/src/nullif.rs +++ b/arrow-select/src/nullif.rs @@ -490,7 +490,18 @@ mod tests { let r_data = r.to_data(); r_data.validate().unwrap(); - assert_eq!(r.as_ref(), 
&expected);
+ assert_eq!(
+ r.as_ref(),
+ &expected,
+ "expected nulls: {:#?}\n\n\
+ result nulls: {:#?}\n\n\
+ expected values: {:#?}\n\n\
+ result values: {:#?}",
+ expected.nulls(),
+ r.nulls(),
+ expected.values(),
+ r.as_primitive::().values()
+ );
}

#[test]

From e7ef09717b3f35871da6b56e27881f8a74d12491 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Sat, 20 Dec 2025 17:10:54 -0500
Subject: [PATCH 03/10] llvm-encouragement

---
arrow-buffer/src/buffer/boolean.rs | 27 ++++++++--------
arrow-buffer/src/buffer/mutable.rs | 49 ++++++++++++++++++------------
2 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs
index 4c3f428949fd..a9275c07cb54 100644
--- a/arrow-buffer/src/buffer/boolean.rs
+++ b/arrow-buffer/src/buffer/boolean.rs
@@ -161,13 +161,12 @@ impl BooleanBuffer {
}

let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits);
+ let result_chunks = chunks.iter().map(|chunk| op(chunk));
let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8);
- for chunk in chunks.iter() {
- // SAFETY: reserved enough capacity above, (exactly num_u64s()
- // items) and we assume `BitChunks` correctly reports upper bound
- unsafe {
- result.push_unchecked(op(chunk));
- }
+ // SAFETY: reserved enough capacity above, (exactly num_u64s()
+ // items) and we assume `BitChunks` correctly reports upper bound
+ unsafe {
+ result.extend_from_trusted_len_iter(result_chunks);
}
if chunks.remainder_len() > 0 {
debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate
@@ -277,14 +276,16 @@ impl BooleanBuffer {
assert_eq!(left_chunks.num_u64s(), right_chunks.num_u64s());

let mut result = MutableBuffer::with_capacity(left_chunks.num_u64s() * 8);
-
- for (left, right) in left_chunks.iter().zip(right_chunks.iter()) {
- // SAFETY: reserved enough capacity above, (exactly num_u64s()
- // items) and we assume `BitChunks` correctly reports upper bound
- unsafe {
- 
result.push_unchecked(op(left, right)); - } + let output_chunks = left_chunks + .iter() + .zip(right_chunks.iter()) + .map(|(left, right)| op(left, right)); + // SAFETY: reserved enough capacity above, (exactly num_u64s() + // items) and we assume `BitChunks` correctly reports upper bound + unsafe { + result.extend_from_trusted_len_iter(output_chunks); } + if left_chunks.remainder_len() > 0 { debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate let op_result = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 75f77242b9e1..c2fb3e23e7f0 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -716,6 +716,32 @@ impl MutableBuffer { iterator.for_each(|item| self.push(item)); } + /// Extends this buffer with an iterator with a trusted length + pub unsafe fn extend_from_trusted_len_iter>( + &mut self, + iterator: I, + ) { + let item_size = std::mem::size_of::(); + let (_, upper) = iterator.size_hint(); + let upper = upper.expect("extend_from_trusted_len_iter requires an upper limit"); + let len = upper * item_size; + self.reserve(len); + + let mut dst = self.data.as_ptr(); + for item in iterator { + // note how there is no reserve here (compared with `extend_from_iter`) + let src = item.to_byte_slice().as_ptr(); + unsafe { std::ptr::copy_nonoverlapping(src, dst, item_size) }; + dst = unsafe { dst.add(item_size) }; + } + assert_eq!( + unsafe { dst.offset_from(self.data.as_ptr()) } as usize, + len, + "Trusted iterator length was not accurately reported" + ); + self.len += len; + } + /// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length. /// Prefer this to `collect` whenever possible, as it is faster ~60% faster. 
/// # Example @@ -737,26 +763,11 @@ impl MutableBuffer { pub unsafe fn from_trusted_len_iter>( iterator: I, ) -> Self { - let item_size = std::mem::size_of::(); - let (_, upper) = iterator.size_hint(); - let upper = upper.expect("from_trusted_len_iter requires an upper limit"); - let len = upper * item_size; - - let mut buffer = MutableBuffer::new(len); - - let mut dst = buffer.data.as_ptr(); - for item in iterator { - // note how there is no reserve here (compared with `extend_from_iter`) - let src = item.to_byte_slice().as_ptr(); - unsafe { std::ptr::copy_nonoverlapping(src, dst, item_size) }; - dst = unsafe { dst.add(item_size) }; + let mut buffer = MutableBuffer::new(0); + // SAFETY: caller guarantees trusted length + unsafe { + buffer.extend_from_trusted_len_iter(iterator); } - assert_eq!( - unsafe { dst.offset_from(buffer.data.as_ptr()) } as usize, - len, - "Trusted iterator length was not accurately reported" - ); - buffer.len = len; buffer } From be3a64f26507fe47b65b09f8a27b1b891b10ca89 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 20 Dec 2025 17:12:58 -0500 Subject: [PATCH 04/10] clippy --- arrow-buffer/src/buffer/boolean.rs | 2 +- arrow-buffer/src/buffer/mutable.rs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index a9275c07cb54..9f9ed25992b4 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -161,7 +161,7 @@ impl BooleanBuffer { } let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits); - let result_chunks = chunks.iter().map(|chunk| op(chunk)); + let result_chunks = chunks.iter().map(&mut op); let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8); // SAFETY: reserved enough capacity above, (exactly num_u64s() // items) and we assume `BitChunks` correctly reports upper bound diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 
c2fb3e23e7f0..f454232ff527 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -717,6 +717,10 @@ impl MutableBuffer { } /// Extends this buffer with an iterator with a trusted length + /// + /// # Safety + /// This method assumes that the iterator's size is correct and is undefined behavior + /// to use it on an iterator that reports an incorrect length. pub unsafe fn extend_from_trusted_len_iter>( &mut self, iterator: I, From 2cdb0fea7bda1e9e97fb2c64ccca422bdac95e8e Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 21 Dec 2025 08:11:50 -0500 Subject: [PATCH 05/10] check --- arrow-buffer/src/buffer/boolean.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 9f9ed25992b4..e42b2048b953 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -259,7 +259,7 @@ impl BooleanBuffer { F: FnMut(u64, u64) -> u64, { // try fast path for aligned input - if left_offset_in_bits % 8 == 0 && right_offset_in_bits % 8 == 0 { + if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { if let Some(result) = Self::try_from_aligned_bitwise_binary_op( &left.as_ref()[left_offset_in_bits / 8..], // aligned to byte boundary &right.as_ref()[right_offset_in_bits / 8..], From 8a7f747d88689e7ede01d6e660a48317d67bfc6b Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 21 Dec 2025 09:50:30 -0500 Subject: [PATCH 06/10] More messing around with vectorization --- arrow-buffer/src/buffer/mutable.rs | 42 +++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index f454232ff527..6828112c95a0 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -721,6 +721,7 @@ impl MutableBuffer { /// # Safety /// This method assumes that the iterator's size is correct and is undefined 
behavior /// to use it on an iterator that reports an incorrect length. + #[inline] pub unsafe fn extend_from_trusted_len_iter>( &mut self, iterator: I, @@ -731,19 +732,42 @@ impl MutableBuffer { let len = upper * item_size; self.reserve(len); - let mut dst = self.data.as_ptr(); + let written = unsafe { + let dst = self.data.as_ptr().add(self.len); + self.extend_from_trusted_len_iter_inner(iterator, dst) + }; + assert_eq!(written as usize, len, "Trusted iterator length was not accurately reported"); + self.len += len; + } + + /// Copies the contents of a trusted-length iterator into the provided destination pointer. + /// returning the number of bytes written. + /// + /// This is its own non inlined function to maximize the chances of + /// maximally vectorizing the inner loop. + /// + /// # Safety + /// The caller must ensure that `dst` points to a memory region large enough to hold + /// all items from the iterator. + #[inline(never)] + pub unsafe fn extend_from_trusted_len_iter_inner>( + &mut self, + iterator: I, + mut dst: *mut u8, + ) -> isize + { + let item_size = std::mem::size_of::(); + let start = dst; for item in iterator { // note how there is no reserve here (compared with `extend_from_iter`) let src = item.to_byte_slice().as_ptr(); - unsafe { std::ptr::copy_nonoverlapping(src, dst, item_size) }; - dst = unsafe { dst.add(item_size) }; + unsafe { + std::ptr::copy_nonoverlapping(src, dst, item_size); + dst = dst.add(item_size) + }; } - assert_eq!( - unsafe { dst.offset_from(self.data.as_ptr()) } as usize, - len, - "Trusted iterator length was not accurately reported" - ); - self.len += len; + unsafe { dst.offset_from(start) } + } /// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length. 
From 2b7f80de8591ab881ec9617754aaf05b3d3c8078 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 21 Dec 2025 09:52:02 -0500 Subject: [PATCH 07/10] fmt --- arrow-buffer/src/buffer/boolean.rs | 2 +- arrow-buffer/src/buffer/mutable.rs | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index e42b2048b953..9ba540662d89 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -259,7 +259,7 @@ impl BooleanBuffer { F: FnMut(u64, u64) -> u64, { // try fast path for aligned input - if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { + if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { if let Some(result) = Self::try_from_aligned_bitwise_binary_op( &left.as_ref()[left_offset_in_bits / 8..], // aligned to byte boundary &right.as_ref()[right_offset_in_bits / 8..], diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index 6828112c95a0..b795faf5b42d 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -733,10 +733,13 @@ impl MutableBuffer { self.reserve(len); let written = unsafe { - let dst = self.data.as_ptr().add(self.len); + let dst = self.data.as_ptr().add(self.len); self.extend_from_trusted_len_iter_inner(iterator, dst) }; - assert_eq!(written as usize, len, "Trusted iterator length was not accurately reported"); + assert_eq!( + written as usize, len, + "Trusted iterator length was not accurately reported" + ); self.len += len; } @@ -754,8 +757,7 @@ impl MutableBuffer { &mut self, iterator: I, mut dst: *mut u8, - ) -> isize - { + ) -> isize { let item_size = std::mem::size_of::(); let start = dst; for item in iterator { @@ -763,11 +765,10 @@ impl MutableBuffer { let src = item.to_byte_slice().as_ptr(); unsafe { std::ptr::copy_nonoverlapping(src, dst, item_size); - dst = dst.add(item_size) + dst = dst.add(item_size) }; } unsafe 
{ dst.offset_from(start) } - } /// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length. From 3fa46837709e1b65ed883a050ab4a3c9cf02fd97 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 21 Dec 2025 12:54:21 -0500 Subject: [PATCH 08/10] use prior implementation --- arrow-buffer/src/buffer/boolean.rs | 45 +++++++---------- arrow-buffer/src/buffer/mutable.rs | 78 ++++++++---------------------- 2 files changed, 36 insertions(+), 87 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 9ba540662d89..709adb1ecfdd 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -19,7 +19,7 @@ use crate::bit_chunk_iterator::BitChunks; use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator, BitSliceIterator}; use crate::{ BooleanBufferBuilder, Buffer, MutableBuffer, bit_util, buffer_bin_and, buffer_bin_or, - buffer_bin_xor, buffer_unary_not, + buffer_bin_xor, buffer_unary_not, util, }; use std::ops::{BitAnd, BitOr, BitXor, Not}; @@ -161,12 +161,13 @@ impl BooleanBuffer { } let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits); - let result_chunks = chunks.iter().map(&mut op); let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8); - // SAFETY: reserved enough capacity above, (exactly num_u64s() - // items) and we assume `BitChunks` correctly reports upper bound - unsafe { - result.extend_from_trusted_len_iter(result_chunks); + for chunk in chunks.iter() { + // SAFETY: reserved enough capacity above, (exactly num_u64s() + // items) and we assume `BitChunks` correctly reports upper bound + unsafe { + result.push_unchecked(op(chunk)); + } } if chunks.remainder_len() > 0 { debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate @@ -269,37 +270,25 @@ impl BooleanBuffer { return result; } } - - // each chunk is 64 bits let left_chunks = BitChunks::new(left.as_ref(), left_offset_in_bits, len_in_bits); 
let right_chunks = BitChunks::new(right.as_ref(), right_offset_in_bits, len_in_bits); - assert_eq!(left_chunks.num_u64s(), right_chunks.num_u64s()); - let mut result = MutableBuffer::with_capacity(left_chunks.num_u64s() * 8); - let output_chunks = left_chunks + let chunks = left_chunks .iter() .zip(right_chunks.iter()) .map(|(left, right)| op(left, right)); - // SAFETY: reserved enough capacity above, (exactly num_u64s() - // items) and we assume `BitChunks` correctly reports upper bound - unsafe { - result.extend_from_trusted_len_iter(output_chunks); - } + // Soundness: `BitChunks` is a `BitChunks` iterator which + // correctly reports its upper bound + let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) }; - if left_chunks.remainder_len() > 0 { - debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate - let op_result = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); - // SAFETY: reserved enough capacity above, (exactly num_u64s() - // items) and we assume `BitChunks` correctly reports upper bound - unsafe { - result.push_unchecked(op_result); - } - // Just pushed one u64, which may have trailing zeros - result.truncate(left_chunks.num_bytes()); - } + let remainder_bytes = util::bit_util::ceil(left_chunks.remainder_len(), 8); + let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits()); + // we are counting its starting from the least significant bit, to to_le_bytes should be correct + let rem = &rem.to_le_bytes()[0..remainder_bytes]; + buffer.extend_from_slice(rem); BooleanBuffer { - buffer: Buffer::from(result), + buffer: Buffer::from(buffer), offset: 0, len: len_in_bits, } diff --git a/arrow-buffer/src/buffer/mutable.rs b/arrow-buffer/src/buffer/mutable.rs index b795faf5b42d..75f77242b9e1 100644 --- a/arrow-buffer/src/buffer/mutable.rs +++ b/arrow-buffer/src/buffer/mutable.rs @@ -716,61 +716,6 @@ impl MutableBuffer { iterator.for_each(|item| self.push(item)); } - /// Extends this buffer 
with an iterator with a trusted length - /// - /// # Safety - /// This method assumes that the iterator's size is correct and is undefined behavior - /// to use it on an iterator that reports an incorrect length. - #[inline] - pub unsafe fn extend_from_trusted_len_iter>( - &mut self, - iterator: I, - ) { - let item_size = std::mem::size_of::(); - let (_, upper) = iterator.size_hint(); - let upper = upper.expect("extend_from_trusted_len_iter requires an upper limit"); - let len = upper * item_size; - self.reserve(len); - - let written = unsafe { - let dst = self.data.as_ptr().add(self.len); - self.extend_from_trusted_len_iter_inner(iterator, dst) - }; - assert_eq!( - written as usize, len, - "Trusted iterator length was not accurately reported" - ); - self.len += len; - } - - /// Copies the contents of a trusted-length iterator into the provided destination pointer. - /// returning the number of bytes written. - /// - /// This is its own non inlined function to maximize the chances of - /// maximally vectorizing the inner loop. - /// - /// # Safety - /// The caller must ensure that `dst` points to a memory region large enough to hold - /// all items from the iterator. - #[inline(never)] - pub unsafe fn extend_from_trusted_len_iter_inner>( - &mut self, - iterator: I, - mut dst: *mut u8, - ) -> isize { - let item_size = std::mem::size_of::(); - let start = dst; - for item in iterator { - // note how there is no reserve here (compared with `extend_from_iter`) - let src = item.to_byte_slice().as_ptr(); - unsafe { - std::ptr::copy_nonoverlapping(src, dst, item_size); - dst = dst.add(item_size) - }; - } - unsafe { dst.offset_from(start) } - } - /// Creates a [`MutableBuffer`] from an [`Iterator`] with a trusted (upper) length. /// Prefer this to `collect` whenever possible, as it is faster ~60% faster. 
/// # Example @@ -792,11 +737,26 @@ impl MutableBuffer { pub unsafe fn from_trusted_len_iter>( iterator: I, ) -> Self { - let mut buffer = MutableBuffer::new(0); - // SAFETY: caller guarantees trusted length - unsafe { - buffer.extend_from_trusted_len_iter(iterator); + let item_size = std::mem::size_of::(); + let (_, upper) = iterator.size_hint(); + let upper = upper.expect("from_trusted_len_iter requires an upper limit"); + let len = upper * item_size; + + let mut buffer = MutableBuffer::new(len); + + let mut dst = buffer.data.as_ptr(); + for item in iterator { + // note how there is no reserve here (compared with `extend_from_iter`) + let src = item.to_byte_slice().as_ptr(); + unsafe { std::ptr::copy_nonoverlapping(src, dst, item_size) }; + dst = unsafe { dst.add(item_size) }; } + assert_eq!( + unsafe { dst.offset_from(buffer.data.as_ptr()) } as usize, + len, + "Trusted iterator length was not accurately reported" + ); + buffer.len = len; buffer } From 03d060bed3184cf3044d49192caae9734eaa76c6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 22 Dec 2025 08:54:39 -0500 Subject: [PATCH 09/10] Add fast path using as_cunks --- arrow-buffer/src/buffer/boolean.rs | 77 ++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index 709adb1ecfdd..cd6ec39ca11c 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -22,6 +22,7 @@ use crate::{ buffer_bin_xor, buffer_unary_not, util, }; +use crate::bit_util::ceil; use std::ops::{BitAnd, BitOr, BitXor, Not}; /// A slice-able [`Buffer`] containing bit-packed booleans @@ -234,7 +235,7 @@ impl BooleanBuffer { /// let result = BooleanBuffer::from_bitwise_binary_op( /// &left, 0, &right, 0, 12, |a, b| a & b /// ); - /// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]); + /// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b10011000u8]); /// ``` /// /// # 
Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices @@ -259,16 +260,14 @@ impl BooleanBuffer { where F: FnMut(u64, u64) -> u64, { - // try fast path for aligned input + // fast path for aligned input if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { - if let Some(result) = Self::try_from_aligned_bitwise_binary_op( + return Self::from_aligned_bitwise_binary_op( &left.as_ref()[left_offset_in_bits / 8..], // aligned to byte boundary &right.as_ref()[right_offset_in_bits / 8..], len_in_bits, &mut op, - ) { - return result; - } + ); } let left_chunks = BitChunks::new(left.as_ref(), left_offset_in_bits, len_in_bits); let right_chunks = BitChunks::new(right.as_ref(), right_offset_in_bits, len_in_bits); @@ -298,38 +297,68 @@ impl BooleanBuffer { /// inputs are aligned to byte boundaries /// /// Returns `None` if the inputs are not fully u64 aligned - fn try_from_aligned_bitwise_binary_op( + fn from_aligned_bitwise_binary_op( left: &[u8], right: &[u8], len_in_bits: usize, op: &mut F, - ) -> Option + ) -> Self where F: FnMut(u64, u64) -> u64, { + // trim to length + let len_in_bytes = ceil(len_in_bits, 8); + let left = &left[0..len_in_bytes]; + let right = &right[0..len_in_bytes]; // Safety: all valid bytes are valid u64s let (left_prefix, left_u64s, left_suffix) = unsafe { left.align_to::() }; let (right_prefix, right_u64s, right_suffix) = unsafe { right.align_to::() }; - if !(left_prefix.is_empty() + if left_prefix.is_empty() && right_prefix.is_empty() && left_suffix.is_empty() - && right_suffix.is_empty()) + && right_suffix.is_empty() { - // Couldn't make this case any faster than the default path - // would be cool to handle non empty prefixes/suffixes too, - return None; + // the buffers are word (64 bit) aligned, so use optimized Vec code. 
+ let result_u64s = left_u64s + .iter() + .zip(right_u64s.iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + BooleanBuffer::new(Buffer::from(result_u64s), 0, len_in_bits) + } else { + let (left_slices, left_remainder) = left.as_chunks::<8>(); + let (right_slices, right_remainder) = right.as_chunks::<8>(); + debug_assert_eq!(left_slices.len(), right_slices.len()); + debug_assert_eq!(left_remainder.len(), right_remainder.len()); + let mut mutable_result = + MutableBuffer::with_capacity(left_slices.len() * 8 + left_remainder.len()); + mutable_result.extend_from_iter( + left_slices + .iter() + .zip(right_slices.iter()) + .map(|(l, r)| op(u64::from_le_bytes(*l), u64::from_le_bytes(*r))), + ); + if !left_remainder.is_empty() { + let rem = op( + u64::from_le_bytes({ + let mut bytes = [0u8; 8]; + bytes[..left_remainder.len()].copy_from_slice(left_remainder); + bytes + }), + u64::from_le_bytes({ + let mut bytes = [0u8; 8]; + bytes[..right_remainder.len()].copy_from_slice(right_remainder); + bytes + }), + ); + mutable_result.extend_from_slice(&rem.to_le_bytes()[..left_remainder.len()]); + } + BooleanBuffer { + buffer: Buffer::from(mutable_result), + offset: 0, + len: len_in_bits, + } } - // the buffers are word (64 bit) aligned, so use optimized Vec code. - let result_u64s = left_u64s - .iter() - .zip(right_u64s.iter()) - .map(|(l, r)| op(*l, *r)) - .collect::>(); - Some(BooleanBuffer::new( - Buffer::from(result_u64s), - 0, - len_in_bits, - )) } /// Returns the number of set bits in this buffer From 57bd03002a3c3e225dbb775e4197432d2efe56b6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 27 Dec 2025 07:19:12 -0500 Subject: [PATCH 10/10] Revert "Add fast path using as_cunks" This reverts commit 03d060bed3184cf3044d49192caae9734eaa76c6. 
--- arrow-buffer/src/buffer/boolean.rs | 77 ++++++++++-------------------- 1 file changed, 24 insertions(+), 53 deletions(-) diff --git a/arrow-buffer/src/buffer/boolean.rs b/arrow-buffer/src/buffer/boolean.rs index cd6ec39ca11c..709adb1ecfdd 100644 --- a/arrow-buffer/src/buffer/boolean.rs +++ b/arrow-buffer/src/buffer/boolean.rs @@ -22,7 +22,6 @@ use crate::{ buffer_bin_xor, buffer_unary_not, util, }; -use crate::bit_util::ceil; use std::ops::{BitAnd, BitOr, BitXor, Not}; /// A slice-able [`Buffer`] containing bit-packed booleans @@ -235,7 +234,7 @@ impl BooleanBuffer { /// let result = BooleanBuffer::from_bitwise_binary_op( /// &left, 0, &right, 0, 12, |a, b| a & b /// ); - /// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b10011000u8]); + /// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]); /// ``` /// /// # Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices @@ -260,14 +259,16 @@ impl BooleanBuffer { where F: FnMut(u64, u64) -> u64, { - // fast path for aligned input + // try fast path for aligned input if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 { - return Self::from_aligned_bitwise_binary_op( + if let Some(result) = Self::try_from_aligned_bitwise_binary_op( &left.as_ref()[left_offset_in_bits / 8..], // aligned to byte boundary &right.as_ref()[right_offset_in_bits / 8..], len_in_bits, &mut op, - ); + ) { + return result; + } } let left_chunks = BitChunks::new(left.as_ref(), left_offset_in_bits, len_in_bits); let right_chunks = BitChunks::new(right.as_ref(), right_offset_in_bits, len_in_bits); @@ -297,68 +298,38 @@ impl BooleanBuffer { /// inputs are aligned to byte boundaries /// /// Returns `None` if the inputs are not fully u64 aligned - fn from_aligned_bitwise_binary_op( + fn try_from_aligned_bitwise_binary_op( left: &[u8], right: &[u8], len_in_bits: usize, op: &mut F, - ) -> Self + ) -> Option where F: FnMut(u64, u64) -> u64, { - // trim to length - let len_in_bytes = 
ceil(len_in_bits, 8); - let left = &left[0..len_in_bytes]; - let right = &right[0..len_in_bytes]; // Safety: all valid bytes are valid u64s let (left_prefix, left_u64s, left_suffix) = unsafe { left.align_to::() }; let (right_prefix, right_u64s, right_suffix) = unsafe { right.align_to::() }; - if left_prefix.is_empty() + if !(left_prefix.is_empty() && right_prefix.is_empty() && left_suffix.is_empty() - && right_suffix.is_empty() + && right_suffix.is_empty()) { - // the buffers are word (64 bit) aligned, so use optimized Vec code. - let result_u64s = left_u64s - .iter() - .zip(right_u64s.iter()) - .map(|(l, r)| op(*l, *r)) - .collect::>(); - BooleanBuffer::new(Buffer::from(result_u64s), 0, len_in_bits) - } else { - let (left_slices, left_remainder) = left.as_chunks::<8>(); - let (right_slices, right_remainder) = right.as_chunks::<8>(); - debug_assert_eq!(left_slices.len(), right_slices.len()); - debug_assert_eq!(left_remainder.len(), right_remainder.len()); - let mut mutable_result = - MutableBuffer::with_capacity(left_slices.len() * 8 + left_remainder.len()); - mutable_result.extend_from_iter( - left_slices - .iter() - .zip(right_slices.iter()) - .map(|(l, r)| op(u64::from_le_bytes(*l), u64::from_le_bytes(*r))), - ); - if !left_remainder.is_empty() { - let rem = op( - u64::from_le_bytes({ - let mut bytes = [0u8; 8]; - bytes[..left_remainder.len()].copy_from_slice(left_remainder); - bytes - }), - u64::from_le_bytes({ - let mut bytes = [0u8; 8]; - bytes[..right_remainder.len()].copy_from_slice(right_remainder); - bytes - }), - ); - mutable_result.extend_from_slice(&rem.to_le_bytes()[..left_remainder.len()]); - } - BooleanBuffer { - buffer: Buffer::from(mutable_result), - offset: 0, - len: len_in_bits, - } + // Couldn't make this case any faster than the default path + // would be cool to handle non empty prefixes/suffixes too, + return None; } + // the buffers are word (64 bit) aligned, so use optimized Vec code. 
+ let result_u64s = left_u64s + .iter() + .zip(right_u64s.iter()) + .map(|(l, r)| op(*l, *r)) + .collect::>(); + Some(BooleanBuffer::new( + Buffer::from(result_u64s), + 0, + len_in_bits, + )) } /// Returns the number of set bits in this buffer