Skip to content

Commit 96637fc

Browse files
authored
Speed up binary kernels (`and` and `or` are 30% faster), add BooleanBuffer::from_bitwise_binary_op (#9090)
# Which issue does this PR close? - Part of #8806 - Closes #8854 - Closes #8807 This is the next step after - #8996 # Rationale for this change - We can help Rust / LLVM generate more optimal code by processing u64 words at a time when the buffer is already u64 aligned (see #8807). Also, it is hard to find the code to create new Buffers by applying bitwise binary operations. # What changes are included in this PR? - Introduce optimized `BooleanBuffer::from_bitwise_binary_op` - Migrate several kernels that use `bitwise_bin_op_helper` to use the new `BooleanBuffer::from_bitwise_binary_op` # Are these changes tested? Yes, new tests are added. Performance results show a 30% performance improvement for the `and` and `or` kernels for aligned buffers (the common case). # Are there any user-facing changes? A new API: `BooleanBuffer::from_bitwise_binary_op`
1 parent 964daec commit 96637fc

File tree

4 files changed

+169
-20
lines changed

4 files changed

+169
-20
lines changed

arrow-arith/src/boolean.rs

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
//! [here](https://doc.rust-lang.org/stable/core/arch/) for more information.
2424
2525
use arrow_array::*;
26-
use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_quaternary_op_helper};
26+
use arrow_buffer::buffer::bitwise_quaternary_op_helper;
2727
use arrow_buffer::{BooleanBuffer, NullBuffer, buffer_bin_and_not};
2828
use arrow_schema::ArrowError;
2929

@@ -74,7 +74,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
7474
// The final null bit is set only if:
7575
// 1. left null bit is set, or
7676
// 2. right data bit is false (because null AND false = false).
77-
Some(bitwise_bin_op_helper(
77+
Some(BooleanBuffer::from_bitwise_binary_op(
7878
left_null_buffer.buffer(),
7979
left_null_buffer.offset(),
8080
right_values.inner(),
@@ -85,7 +85,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
8585
}
8686
(None, Some(right_null_buffer)) => {
8787
// Same as above
88-
Some(bitwise_bin_op_helper(
88+
Some(BooleanBuffer::from_bitwise_binary_op(
8989
right_null_buffer.buffer(),
9090
right_null_buffer.offset(),
9191
left_values.inner(),
@@ -100,7 +100,7 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
100100
// d is right data bits.
101101
// The final null bits are:
102102
// (a | (c & !d)) & (c | (a & !b))
103-
Some(bitwise_quaternary_op_helper(
103+
let buffer = bitwise_quaternary_op_helper(
104104
[
105105
left_null_buffer.buffer(),
106106
left_values.inner(),
@@ -115,10 +115,11 @@ pub fn and_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanAr
115115
],
116116
left.len(),
117117
|a, b, c, d| (a | (c & !d)) & (c | (a & !b)),
118-
))
118+
);
119+
Some(BooleanBuffer::new(buffer, 0, left.len()))
119120
}
120121
};
121-
let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
122+
let nulls = buffer.map(NullBuffer::new);
122123
Ok(BooleanArray::new(left_values & right_values, nulls))
123124
}
124125

@@ -169,7 +170,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
169170
// The final null bit is set only if:
170171
// 1. left null bit is set, or
171172
// 2. right data bit is true (because null OR true = true).
172-
Some(bitwise_bin_op_helper(
173+
Some(BooleanBuffer::from_bitwise_binary_op(
173174
left_nulls.buffer(),
174175
left_nulls.offset(),
175176
right_values.inner(),
@@ -180,7 +181,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
180181
}
181182
(None, Some(right_nulls)) => {
182183
// Same as above
183-
Some(bitwise_bin_op_helper(
184+
Some(BooleanBuffer::from_bitwise_binary_op(
184185
right_nulls.buffer(),
185186
right_nulls.offset(),
186187
left_values.inner(),
@@ -195,7 +196,7 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
195196
// d is right data bits.
196197
// The final null bits are:
197198
// (a | (c & d)) & (c | (a & b))
198-
Some(bitwise_quaternary_op_helper(
199+
let buffer = bitwise_quaternary_op_helper(
199200
[
200201
left_nulls.buffer(),
201202
left_values.inner(),
@@ -210,11 +211,12 @@ pub fn or_kleene(left: &BooleanArray, right: &BooleanArray) -> Result<BooleanArr
210211
],
211212
left.len(),
212213
|a, b, c, d| (a | (c & d)) & (c | (a & b)),
213-
))
214+
);
215+
Some(BooleanBuffer::new(buffer, 0, left.len()))
214216
}
215217
};
216218

217-
let nulls = buffer.map(|b| NullBuffer::new(BooleanBuffer::new(b, 0, left.len())));
219+
let nulls = buffer.map(NullBuffer::new);
218220
Ok(BooleanArray::new(left_values | right_values, nulls))
219221
}
220222

arrow-buffer/src/buffer/boolean.rs

Lines changed: 147 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -169,9 +169,10 @@ impl BooleanBuffer {
169169
/// * The output always has zero offset
170170
///
171171
/// # See Also
172+
/// - [`BooleanBuffer::from_bitwise_binary_op`] to create a new buffer from a binary operation
172173
/// - [`apply_bitwise_unary_op`](bit_util::apply_bitwise_unary_op) for in-place unary bitwise operations
173174
///
174-
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of an input [`Buffer`]
175+
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of a byte slice
175176
/// ```
176177
/// # use arrow_buffer::BooleanBuffer;
177178
/// let input = [0b11001100u8, 0b10111010u8]; // 2 bytes = 16 bits
@@ -221,9 +222,8 @@ impl BooleanBuffer {
221222
result.truncate(chunks.num_bytes());
222223
}
223224

224-
let buffer = Buffer::from(result);
225225
BooleanBuffer {
226-
buffer,
226+
buffer: Buffer::from(result),
227227
bit_offset: 0,
228228
bit_len: len_in_bits,
229229
}
@@ -254,6 +254,112 @@ impl BooleanBuffer {
254254
Some(BooleanBuffer::new(buffer, 0, len_in_bits))
255255
}
256256

257+
/// Create a new [`BooleanBuffer`] by applying the bitwise operation `op` to
258+
/// the relevant bits from two input buffers.
259+
///
260+
/// This function is faster than applying the operation bit by bit as
261+
/// it processes input buffers in chunks of 64 bits (8 bytes) at a time
262+
///
263+
/// # Notes:
264+
/// See notes on [Self::from_bitwise_unary_op]
265+
///
266+
/// # See Also
267+
/// - [`BooleanBuffer::from_bitwise_unary_op`] for unary operations on a single input buffer.
268+
/// - [`apply_bitwise_binary_op`](bit_util::apply_bitwise_binary_op) for in-place binary bitwise operations
269+
///
270+
/// # Example: Create new [`BooleanBuffer`] from bitwise `AND` of two [`Buffer`]s
271+
/// ```
272+
/// # use arrow_buffer::{Buffer, BooleanBuffer};
273+
/// let left = Buffer::from(vec![0b11001100u8, 0b10111010u8]); // 2 bytes = 16 bits
274+
/// let right = Buffer::from(vec![0b10101010u8, 0b11011100u8, 0b11110000u8]); // 3 bytes = 24 bits
275+
/// // AND of the first 12 bits
276+
/// let result = BooleanBuffer::from_bitwise_binary_op(
277+
/// &left, 0, &right, 0, 12, |a, b| a & b
278+
/// );
279+
/// assert_eq!(result.inner().as_slice(), &[0b10001000u8, 0b00001000u8]);
280+
/// ```
281+
///
282+
/// # Example: Create new [`BooleanBuffer`] from bitwise `OR` of two byte slices
283+
/// ```
284+
/// # use arrow_buffer::BooleanBuffer;
285+
/// let left = [0b11001100u8, 0b10111010u8];
286+
/// let right = [0b10101010u8, 0b11011100u8];
287+
/// // OR of bits 4..16 from left and bits 0..12 from right
288+
/// let result = BooleanBuffer::from_bitwise_binary_op(
289+
/// &left, 4, &right, 0, 12, |a, b| a | b
290+
/// );
291+
/// assert_eq!(result.inner().as_slice(), &[0b10101110u8, 0b00001111u8]);
292+
/// ```
293+
pub fn from_bitwise_binary_op<F>(
294+
left: impl AsRef<[u8]>,
295+
left_offset_in_bits: usize,
296+
right: impl AsRef<[u8]>,
297+
right_offset_in_bits: usize,
298+
len_in_bits: usize,
299+
mut op: F,
300+
) -> Self
301+
where
302+
F: FnMut(u64, u64) -> u64,
303+
{
304+
let left = left.as_ref();
305+
let right = right.as_ref();
306+
// try fast path for aligned input
307+
// If the underlying buffers are aligned to u64 we can apply the operation directly on the u64 slices
308+
// to improve performance.
309+
if left_offset_in_bits & 0x7 == 0 && right_offset_in_bits & 0x7 == 0 {
310+
// align to byte boundary
311+
let left = &left[left_offset_in_bits / 8..];
312+
let right = &right[right_offset_in_bits / 8..];
313+
314+
unsafe {
315+
let (left_prefix, left_u64s, left_suffix) = left.align_to::<u64>();
316+
let (right_prefix, right_u64s, right_suffix) = right.align_to::<u64>();
317+
// if there is no prefix or suffix, both buffers are aligned and
318+
// we can do the operation directly on u64s.
319+
// TODO: consider `slice::as_chunks` and `u64::from_le_bytes` when MSRV reaches 1.88.
320+
// https://github.com/apache/arrow-rs/pull/9022#discussion_r2639949361
321+
if left_prefix.is_empty()
322+
&& right_prefix.is_empty()
323+
&& left_suffix.is_empty()
324+
&& right_suffix.is_empty()
325+
{
326+
let result_u64s = left_u64s
327+
.iter()
328+
.zip(right_u64s.iter())
329+
.map(|(l, r)| op(*l, *r))
330+
.collect::<Vec<u64>>();
331+
return BooleanBuffer {
332+
buffer: Buffer::from(result_u64s),
333+
bit_offset: 0,
334+
bit_len: len_in_bits,
335+
};
336+
}
337+
}
338+
}
339+
let left_chunks = BitChunks::new(left, left_offset_in_bits, len_in_bits);
340+
let right_chunks = BitChunks::new(right, right_offset_in_bits, len_in_bits);
341+
342+
let chunks = left_chunks
343+
.iter()
344+
.zip(right_chunks.iter())
345+
.map(|(left, right)| op(left, right));
346+
// Soundness: `BitChunks` is a `BitChunks` trusted length iterator which
347+
// correctly reports its upper bound
348+
let mut buffer = unsafe { MutableBuffer::from_trusted_len_iter(chunks) };
349+
350+
let remainder_bytes = bit_util::ceil(left_chunks.remainder_len(), 8);
351+
let rem = op(left_chunks.remainder_bits(), right_chunks.remainder_bits());
352+
// we are counting its starting from the least significant bit, to to_le_bytes should be correct
353+
let rem = &rem.to_le_bytes()[0..remainder_bytes];
354+
buffer.extend_from_slice(rem);
355+
356+
BooleanBuffer {
357+
buffer: Buffer::from(buffer),
358+
bit_offset: 0,
359+
bit_len: len_in_bits,
360+
}
361+
}
362+
257363
/// Returns the number of set bits in this buffer
258364
pub fn count_set_bits(&self) -> usize {
259365
self.buffer
@@ -656,4 +762,42 @@ mod tests {
656762
assert_eq!(result, expected);
657763
}
658764
}
765+
766+
#[test]
fn test_from_bitwise_binary_op() {
    // Two independent random bit patterns, 1024 bits each
    let input_bools_left: Vec<bool> = std::iter::repeat_with(rand::random::<bool>)
        .take(1024)
        .collect();
    let input_bools_right: Vec<bool> = std::iter::repeat_with(rand::random::<bool>)
        .take(1024)
        .collect();
    let input_buffer_left = BooleanBuffer::from(input_bools_left.as_slice());
    let input_buffer_right = BooleanBuffer::from(input_bools_right.as_slice());

    // Offsets are chosen to exercise both byte-aligned and unaligned starts
    let right_offsets = [0, 4, 5, 17, 33, 24, 45, 64, 65, 100, 200];
    let len_offsets = [0, 1, 44, 100, 256, 300, 512];

    for left_offset in 0..200 {
        for right_offset in right_offsets {
            for len_offset in len_offsets {
                // shrink the length so neither input is read out of bounds
                let furthest_start = left_offset.max(right_offset);
                let len = 1024 - len_offset - furthest_start;

                // AND computed by the function under test
                let result = BooleanBuffer::from_bitwise_binary_op(
                    input_buffer_left.values(),
                    left_offset,
                    input_buffer_right.values(),
                    right_offset,
                    len,
                    |a, b| a & b,
                );

                // AND computed bit by bit directly from the source bools
                let expected = (0..len)
                    .map(|i| input_bools_left[left_offset + i] & input_bools_right[right_offset + i])
                    .collect::<BooleanBuffer>();

                assert_eq!(result, expected);
            }
        }
    }
}
659803
}

arrow-buffer/src/buffer/ops.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -150,14 +150,15 @@ pub fn buffer_bin_and(
150150
right_offset_in_bits: usize,
151151
len_in_bits: usize,
152152
) -> Buffer {
153-
bitwise_bin_op_helper(
153+
BooleanBuffer::from_bitwise_binary_op(
154154
left,
155155
left_offset_in_bits,
156156
right,
157157
right_offset_in_bits,
158158
len_in_bits,
159159
|a, b| a & b,
160160
)
161+
.into_inner()
161162
}
162163

163164
/// Apply a bitwise or to two inputs and return the result as a Buffer.
@@ -169,14 +170,15 @@ pub fn buffer_bin_or(
169170
right_offset_in_bits: usize,
170171
len_in_bits: usize,
171172
) -> Buffer {
172-
bitwise_bin_op_helper(
173+
BooleanBuffer::from_bitwise_binary_op(
173174
left,
174175
left_offset_in_bits,
175176
right,
176177
right_offset_in_bits,
177178
len_in_bits,
178179
|a, b| a | b,
179180
)
181+
.into_inner()
180182
}
181183

182184
/// Apply a bitwise xor to two inputs and return the result as a Buffer.
@@ -188,14 +190,15 @@ pub fn buffer_bin_xor(
188190
right_offset_in_bits: usize,
189191
len_in_bits: usize,
190192
) -> Buffer {
191-
bitwise_bin_op_helper(
193+
BooleanBuffer::from_bitwise_binary_op(
192194
left,
193195
left_offset_in_bits,
194196
right,
195197
right_offset_in_bits,
196198
len_in_bits,
197199
|a, b| a ^ b,
198200
)
201+
.into_inner()
199202
}
200203

201204
/// Apply a bitwise and_not to two inputs and return the result as a Buffer.
@@ -207,19 +210,19 @@ pub fn buffer_bin_and_not(
207210
right_offset_in_bits: usize,
208211
len_in_bits: usize,
209212
) -> Buffer {
210-
bitwise_bin_op_helper(
213+
BooleanBuffer::from_bitwise_binary_op(
211214
left,
212215
left_offset_in_bits,
213216
right,
214217
right_offset_in_bits,
215218
len_in_bits,
216219
|a, b| a & !b,
217220
)
221+
.into_inner()
218222
}
219223

220224
/// Apply a bitwise not to one input and return the result as a Buffer.
221225
/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits.
222226
pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer {
223-
// TODO: should we deprecate this function in favor of the Buffer ! impl ?
224227
BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, |a| !a).into_inner()
225228
}

arrow-select/src/nullif.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper};
2323
use arrow_schema::{ArrowError, DataType};
2424

2525
/// Returns a new array with the same values and the validity bit to false where
26-
/// the corresponding element of`right` is true.
26+
/// the corresponding element of `right` is true.
2727
///
2828
/// This can be used to implement SQL `NULLIF`
2929
///

0 commit comments

Comments
 (0)