Skip to content

Commit b7fd764

Browse files
committed
Add BooleanBuffer::from_bitwise_binary_op
1 parent 42f7ce7 commit b7fd764

File tree

2 files changed

+149
-43
lines changed

2 files changed

+149
-43
lines changed

arrow-buffer/src/buffer/boolean.rs

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,107 @@ impl BooleanBuffer {
253253
Some(BooleanBuffer::new(buffer, 0, len_in_bits))
254254
}
255255

256+
/// Create a new [`BooleanBuffer`] by applying the bitwise operation `op` to
257+
/// the relevant bits from two input buffers.
258+
///
259+
/// This function is faster than applying the operation bit by bit as
260+
/// it processes input buffers in chunks of 64 bits (8 bytes) at a time
261+
///
262+
/// # Notes:
263+
/// See notes on [Self::from_bitwise_unary_op]
264+
///
265+
/// # See Also
266+
/// - [`BooleanBuffer::from_bitwise_unary_op`] for unary operations on a single input buffer.
267+
/// - [`apply_bitwise_binary_op`](bit_util::apply_bitwise_binary_op) for in-place binary bitwise operations
268+
///
269+
/// # Example: Create new [`BooleanBuffer`] from bitwise `AND` of two [`Buffer`]s
270+
/// ```
271+
/// # use arrow_buffer::{Buffer, BooleanBuffer};
272+
/// let left = Buffer::from(vec![0b11001100u8, 0b10111010u8]); // 2 bytes = 16 bits
273+
/// let right = Buffer::from(vec![0b10101010u8, 0b11011100u8, 0b11110000u8]); // 3 bytes = 24 bits
274+
/// // AND of the first 12 bits
275+
/// let result = BooleanBuffer::from_bitwise_binary_op(
276+
/// &left, 0, &right, 0, 12, |a, b| a & b
277+
/// );
278+
/// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]);
279+
/// ```
280+
///
281+
/// # Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices
282+
/// ```
283+
/// # use arrow_buffer::BooleanBuffer;
284+
/// let left = [0b11001100u8, 0b10111010u8];
285+
/// let right = [0b10101010u8, 0b11011100u8];
286+
/// // OR of bits 4..16 from left and bits 0..12 from right
287+
/// let result = BooleanBuffer::from_bitwise_binary_op(
288+
/// &left, 4, &right, 0, 12, |a, b| a | b
289+
/// );
290+
/// assert_eq!(result.inner().as_slice(), &[0b10101110u8, 0b00001111u8]);
291+
/// ```
292+
pub fn from_bitwise_binary_op<F>(
293+
left: impl AsRef<[u8]>,
294+
left_offset_in_bits: usize,
295+
right: impl AsRef<[u8]>,
296+
right_offset_in_bits: usize,
297+
len_in_bits: usize,
298+
mut op: F,
299+
) -> Self
300+
where
301+
F: FnMut(u64, u64) -> u64,
302+
{
303+
let left = left.as_ref();
304+
let right = right.as_ref();
305+
// try fast path for aligned input
306+
// If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices
307+
// to improve performance.
308+
if left_offset_in_bits == 0 && right_offset_in_bits == 0 {
309+
unsafe {
310+
let (left_prefix, left_u64s, left_suffix) = left.align_to::<u64>();
311+
let (right_prefix, right_u64s, right_suffix) = right.align_to::<u64>();
312+
// if there is no prefix or suffix, both buffers are aligned and we can do the operation directly
313+
// on u64s
314+
// TODO also handle non empty suffixes by processing them separately
315+
if left_prefix.is_empty()
316+
&& right_prefix.is_empty()
317+
&& left_suffix.is_empty()
318+
&& right_suffix.is_empty()
319+
{
320+
let result_u64s = left_u64s
321+
.iter()
322+
.zip(right_u64s.iter())
323+
.map(|(l, r)| op(*l, *r))
324+
.collect::<Vec<u64>>();
325+
return BooleanBuffer {
326+
buffer: Buffer::from(result_u64s),
327+
bit_offset: 0,
328+
bit_len: len_in_bits,
329+
}
330+
}
331+
}
332+
}
333+
let left_chunks = BitChunks::new(left, left_offset_in_bits, len_in_bits);
334+
let right_chunks = BitChunks::new(right, right_offset_in_bits, len_in_bits);
335+
336+
let chunks = left_chunks
337+
.iter()
338+
.zip(right_chunks.iter())
339+
.map(|(left, right)| op(left, right));
340+
// Soundness: `BitChunks` is a `BitChunks` iterator which
341+
// correctly reports its upper bound
342+
let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) };
343+
344+
let remainder_bytes = bit_util::ceil(left_chunks.remainder_len(), 8);
345+
let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
346+
// we are counting its starting from the least significant bit, to to_le_bytes should be correct
347+
let rem = &rem.to_le_bytes()[0..remainder_bytes];
348+
buffer.extend_from_slice(rem);
349+
350+
BooleanBuffer {
351+
buffer: Buffer::from(buffer),
352+
bit_offset: 0,
353+
bit_len: len_in_bits,
354+
}
355+
}
356+
256357
/// Returns the number of set bits in this buffer
257358
pub fn count_set_bits(&self) -> usize {
258359
self.buffer
@@ -655,4 +756,42 @@ mod tests {
655756
assert_eq!(result, expected);
656757
}
657758
}
759+
760+
#[test]
fn test_from_bitwise_binary_op() {
    // Total number of random bits generated for each side.
    const NUM_BITS: usize = 1024;

    // Produces a fresh vector of random booleans.
    let random_bools = || -> Vec<bool> {
        (0..NUM_BITS).map(|_| rand::random::<bool>()).collect()
    };
    let left_bools = random_bools();
    let right_bools = random_bools();
    let left_buffer = BooleanBuffer::from(left_bools.as_slice());
    let right_buffer = BooleanBuffer::from(right_bools.as_slice());

    // Offsets (in bits) exercised for the right input, and amounts by which
    // the compared length is shortened.
    let right_offsets = [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200];
    let len_offsets = [0, 1, 44, 100, 256, 300, 512];

    for left_offset in 0..200 {
        for right_offset in right_offsets {
            for len_offset in len_offsets {
                // Stay within bounds of whichever input starts later.
                let len = 1024 - len_offset - left_offset.max(right_offset);

                // Reference result: AND computed one boolean at a time.
                let expected: BooleanBuffer = left_bools[left_offset..]
                    .iter()
                    .zip(&right_bools[right_offset..])
                    .take(len)
                    .map(|(l, r)| *l & *r)
                    .collect();

                // Result under test: AND computed on the packed bits.
                let actual = BooleanBuffer::from_bitwise_binary_op(
                    left_buffer.values(),
                    left_offset,
                    right_buffer.values(),
                    right_offset,
                    len,
                    |a, b| a & b,
                );

                assert_eq!(actual, expected);
            }
        }
    }
}
658797
}

arrow-buffer/src/buffer/ops.rs

Lines changed: 10 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -69,53 +69,20 @@ pub fn bitwise_bin_op_helper<F>(
6969
right: &Buffer,
7070
right_offset_in_bits: usize,
7171
len_in_bits: usize,
72-
mut op: F,
72+
op: F,
7373
) -> Buffer
7474
where
7575
F: FnMut(u64, u64) -> u64,
7676
{
77-
// If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices
78-
// to improve performance.
79-
if left_offset_in_bits == 0 && right_offset_in_bits == 0 {
80-
unsafe {
81-
let (left_prefix, left_u64s, left_suffix) = left.as_slice().align_to::<u64>();
82-
let (right_prefix, right_u64s, right_suffix) = right.as_slice().align_to::<u64>();
83-
// if there is no prefix or suffix, both buffers are aligned and we can do the operation directly
84-
// on u64s
85-
// TODO also handle non empty suffixes by processing them separately
86-
if left_prefix.is_empty()
87-
&& right_prefix.is_empty()
88-
&& left_suffix.is_empty()
89-
&& right_suffix.is_empty()
90-
{
91-
let result_u64s = left_u64s
92-
.iter()
93-
.zip(right_u64s.iter())
94-
.map(|(l, r)| op(*l, *r))
95-
.collect::<Vec<u64>>();
96-
return result_u64s.into();
97-
}
98-
}
99-
}
100-
101-
let left_chunks = left.bit_chunks(left_offset_in_bits, len_in_bits);
102-
let right_chunks = right.bit_chunks(right_offset_in_bits, len_in_bits);
103-
104-
let chunks = left_chunks
105-
.iter()
106-
.zip(right_chunks.iter())
107-
.map(|(left, right)| op(left, right));
108-
// Soundness: `BitChunks` is a `BitChunks` iterator which
109-
// correctly reports its upper bound
110-
let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) };
111-
112-
let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
113-
let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
114-
// we are counting its starting from the least significant bit, to to_le_bytes should be correct
115-
let rem = &rem.to_le_bytes()[0..remainder_bytes];
116-
buffer.extend_from_slice(rem);
117-
118-
buffer.into()
77+
BooleanBuffer::from_bitwise_binary_op(
78+
left,
79+
left_offset_in_bits,
80+
right,
81+
right_offset_in_bits,
82+
len_in_bits,
83+
op,
84+
)
85+
.into_inner()
11986
}
12087

12188
/// Apply a bitwise operation `op` to one input and return the result as a Buffer.

0 commit comments

Comments
 (0)