Skip to content

Commit 6b290d1

Browse files
alamb and mhilton authored
Speed up unary not kernel by 50%, add BooleanBuffer::from_bitwise_unary_op (#8996)
# Which issue does this PR close? - part of #8806 - broken out from #8854 # Rationale for this change The current implementation of the unary not kernel has an extra allocation when operating on sliced data, which is not necessary. Also, we can generate more optimal code by processing u64 words at a time when the buffer is already u64 aligned (see #8807). Also, it is hard to find the code to create new Buffers by copying bits. # What changes are included in this PR? 1. Introduce `BooleanBuffer::from_bitwise_unary_op` and `BooleanBuffer::from_bits` 2. Deprecate `bitwise_unary_op_helper` # Are these changes tested? Yes, with new tests and benchmarks # Are there any user-facing changes? New public API --------- Co-authored-by: Martin Hilton <mhilton@influxdata.com>
1 parent c2bd7d9 commit 6b290d1

File tree

5 files changed

+201
-40
lines changed

5 files changed

+201
-40
lines changed

arrow-buffer/src/buffer/boolean.rs

Lines changed: 157 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ use std::ops::{BitAnd, BitOr, BitXor, Not};
2626

2727
/// A slice-able [`Buffer`] containing bit-packed booleans
2828
///
29-
/// `BooleanBuffer`s can be creating using [`BooleanBufferBuilder`]
29+
/// `BooleanBuffer`s can be modified using [`BooleanBufferBuilder`]
3030
///
31-
/// # See Also
3231
///
32+
/// # See Also
3333
/// * [`NullBuffer`] for representing null values in Arrow arrays
3434
///
3535
/// [`NullBuffer`]: crate::NullBuffer
@@ -96,12 +96,128 @@ impl BooleanBuffer {
9696
Self::new(buffer.into(), 0, len)
9797
}
9898

99+
/// Create a new [`BooleanBuffer`] by copying the relevant bits from an
100+
/// input buffer.
101+
///
102+
/// # Notes:
103+
/// * The new `BooleanBuffer` has zero offset, even if `offset_in_bits` is non-zero
104+
///
105+
/// # Example: Create a new [`BooleanBuffer`] copying a bit slice from an input slice
106+
/// ```
107+
/// # use arrow_buffer::BooleanBuffer;
108+
/// let input = [0b11001100u8, 0b10111010u8];
109+
/// // Copy bits 4..16 from input
110+
/// let result = BooleanBuffer::from_bits(&input, 4, 12);
111+
/// assert_eq!(result.values(), &[0b10101100u8, 0b00001011u8]);
112+
pub fn from_bits(src: impl AsRef<[u8]>, offset_in_bits: usize, len_in_bits: usize) -> Self {
113+
Self::from_bitwise_unary_op(src, offset_in_bits, len_in_bits, |a| a)
114+
}
115+
116+
/// Create a new [`BooleanBuffer`] by applying the bitwise operation `op`
117+
/// to an input buffer.
118+
///
119+
/// This function is faster than applying the operation bit by bit as
120+
/// it processes input buffers in chunks of 64 bits (8 bytes) at a time
121+
///
122+
/// # Notes:
123+
/// * `op` takes a single `u64` input and produces one `u64` output.
124+
/// * `op` must only apply bitwise operations
125+
/// on the relevant bits; the input `u64` may contain irrelevant bits
126+
/// and may be processed differently on different endian architectures.
127+
/// * The output always has zero offset
128+
///
129+
/// # See Also
130+
/// - [`apply_bitwise_unary_op`](bit_util::apply_bitwise_unary_op) for in-place unary bitwise operations
131+
///
132+
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of an input [`Buffer`]
133+
/// ```
134+
/// # use arrow_buffer::BooleanBuffer;
135+
/// let input = [0b11001100u8, 0b10111010u8]; // 2 bytes = 16 bits
136+
/// // NOT of the first 12 bits
137+
/// let result = BooleanBuffer::from_bitwise_unary_op(
138+
/// &input, 0, 12, |a| !a
139+
/// );
140+
/// assert_eq!(result.values(), &[0b00110011u8, 0b11110101u8]);
141+
/// ```
142+
pub fn from_bitwise_unary_op<F>(
143+
src: impl AsRef<[u8]>,
144+
offset_in_bits: usize,
145+
len_in_bits: usize,
146+
mut op: F,
147+
) -> Self
148+
where
149+
F: FnMut(u64) -> u64,
150+
{
151+
// try fast path for aligned input
152+
if offset_in_bits & 0x7 == 0 {
153+
// align to byte boundary
154+
let aligned = &src.as_ref()[offset_in_bits / 8..];
155+
if let Some(result) =
156+
Self::try_from_aligned_bitwise_unary_op(aligned, len_in_bits, &mut op)
157+
{
158+
return result;
159+
}
160+
}
161+
162+
let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits);
163+
let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8);
164+
for chunk in chunks.iter() {
165+
// SAFETY: reserved enough capacity above, (exactly num_u64s()
166+
// items) and we assume `BitChunks` correctly reports upper bound
167+
unsafe {
168+
result.push_unchecked(op(chunk));
169+
}
170+
}
171+
if chunks.remainder_len() > 0 {
172+
debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate
173+
// SAFETY: reserved enough capacity above, (exactly num_u64s()
174+
// items) and we assume `BitChunks` correctly reports upper bound
175+
unsafe {
176+
result.push_unchecked(op(chunks.remainder_bits()));
177+
}
178+
// Just pushed one full u64, which may extend past the last needed byte; trim to the exact byte length
179+
result.truncate(chunks.num_bytes());
180+
}
181+
182+
let buffer = Buffer::from(result);
183+
BooleanBuffer {
184+
buffer,
185+
offset: 0,
186+
len: len_in_bits,
187+
}
188+
}
189+
190+
/// Fast path for [`Self::from_bitwise_unary_op`] when input is aligned to
191+
/// 8-byte (64-bit) boundaries
192+
///
193+
/// Returns None if the fast path cannot be taken
194+
fn try_from_aligned_bitwise_unary_op<F>(
195+
src: &[u8],
196+
len_in_bits: usize,
197+
op: &mut F,
198+
) -> Option<Self>
199+
where
200+
F: FnMut(u64) -> u64,
201+
{
202+
// Safety: all valid bytes are valid u64s
203+
let (prefix, aligned_u6us, suffix) = unsafe { src.align_to::<u64>() };
204+
if !(prefix.is_empty() && suffix.is_empty()) {
205+
// Couldn't make this case any faster than the default path, see
206+
// https://github.com/apache/arrow-rs/pull/8996/changes#r2620022082
207+
return None;
208+
}
209+
// the buffer is word (64 bit) aligned, so use optimized Vec code.
210+
let result_u64s: Vec<u64> = aligned_u6us.iter().map(|l| op(*l)).collect();
211+
let buffer = Buffer::from(result_u64s);
212+
Some(BooleanBuffer::new(buffer, 0, len_in_bits))
213+
}
214+
99215
/// Returns the number of set bits in this buffer
100216
pub fn count_set_bits(&self) -> usize {
101217
self.buffer.count_set_bits_offset(self.offset, self.len)
102218
}
103219

104-
/// Returns a `BitChunks` instance which can be used to iterate over
220+
/// Returns a [`BitChunks`] instance which can be used to iterate over
105221
/// this buffer's bits in `u64` chunks
106222
#[inline]
107223
pub fn bit_chunks(&self) -> BitChunks<'_> {
@@ -437,4 +553,42 @@ mod tests {
437553
assert_eq!(buf.values().len(), 1);
438554
assert!(buf.value(0));
439555
}
556+
557+
#[test]
558+
fn test_from_bitwise_unary_op() {
559+
// Use 1024 boolean values so that at least some of the tests cover multiple u64 chunks and
560+
// perfect alignment
561+
let input_bools = (0..1024)
562+
.map(|_| rand::random::<bool>())
563+
.collect::<Vec<bool>>();
564+
let input_buffer = BooleanBuffer::from(&input_bools[..]);
565+
566+
// Note: ensure we test offsets over 100 to cover multiple u64 chunks
567+
for offset in 0..1024 {
568+
let result = BooleanBuffer::from_bitwise_unary_op(
569+
input_buffer.values(),
570+
offset,
571+
input_buffer.len() - offset,
572+
|a| !a,
573+
);
574+
let expected = input_bools[offset..]
575+
.iter()
576+
.map(|b| !*b)
577+
.collect::<BooleanBuffer>();
578+
assert_eq!(result, expected);
579+
}
580+
581+
// Also test when the input doesn't cover the entire buffer
582+
for offset in 0..512 {
583+
let len = 512 - offset; // fixed length less than total
584+
let result =
585+
BooleanBuffer::from_bitwise_unary_op(input_buffer.values(), offset, len, |a| !a);
586+
let expected = input_bools[offset..]
587+
.iter()
588+
.take(len)
589+
.map(|b| !*b)
590+
.collect::<BooleanBuffer>();
591+
assert_eq!(result, expected);
592+
}
593+
}
440594
}

arrow-buffer/src/buffer/immutable.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,14 @@ use std::fmt::Debug;
2020
use std::ptr::NonNull;
2121
use std::sync::Arc;
2222

23-
use crate::BufferBuilder;
2423
use crate::alloc::{Allocation, Deallocation};
2524
use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk};
25+
use crate::{BooleanBuffer, BufferBuilder};
2626
use crate::{bit_util, bytes::Bytes, native::ArrowNativeType};
2727

2828
#[cfg(feature = "pool")]
2929
use crate::pool::MemoryPool;
3030

31-
use super::ops::bitwise_unary_op_helper;
3231
use super::{MutableBuffer, ScalarBuffer};
3332

3433
/// A contiguous memory region that can be shared with other buffers and across
@@ -344,7 +343,7 @@ impl Buffer {
344343
return self.slice_with_length(offset / 8, bit_util::ceil(len, 8));
345344
}
346345

347-
bitwise_unary_op_helper(self, offset, len, |a| a)
346+
BooleanBuffer::from_bits(self.as_slice(), offset, len).into_inner()
348347
}
349348

350349
/// Returns a `BitChunks` instance which can be used to iterate over this buffers bits

arrow-buffer/src/buffer/ops.rs

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// under the License.
1717

1818
use super::{Buffer, MutableBuffer};
19+
use crate::BooleanBuffer;
1920
use crate::util::bit_util::ceil;
2021

2122
/// Apply a bitwise operation `op` to four inputs and return the result as a Buffer.
@@ -93,36 +94,20 @@ where
9394

9495
/// Apply a bitwise operation `op` to one input and return the result as a Buffer.
9596
/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits.
97+
#[deprecated(
98+
since = "57.2.0",
99+
note = "use BooleanBuffer::from_bitwise_unary_op instead"
100+
)]
96101
pub fn bitwise_unary_op_helper<F>(
97102
left: &Buffer,
98103
offset_in_bits: usize,
99104
len_in_bits: usize,
100-
mut op: F,
105+
op: F,
101106
) -> Buffer
102107
where
103108
F: FnMut(u64) -> u64,
104109
{
105-
// reserve capacity and set length so we can get a typed view of u64 chunks
106-
let mut result =
107-
MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 * 8, false);
108-
109-
let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);
110-
111-
let result_chunks = result.typed_data_mut::<u64>().iter_mut();
112-
113-
result_chunks
114-
.zip(left_chunks.iter())
115-
.for_each(|(res, left)| {
116-
*res = op(left);
117-
});
118-
119-
let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
120-
let rem = op(left_chunks.remainder_bits());
121-
// we are counting its starting from the least significant bit, to to_le_bytes should be correct
122-
let rem = &rem.to_le_bytes()[0..remainder_bytes];
123-
result.extend_from_slice(rem);
124-
125-
result.into()
110+
BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, op).into_inner()
126111
}
127112

128113
/// Apply a bitwise and to two inputs and return the result as a Buffer.
@@ -204,5 +189,6 @@ pub fn buffer_bin_and_not(
204189
/// Apply a bitwise not to one input and return the result as a Buffer.
205190
/// The input is treated as a bitmap, meaning that offset and length are specified in number of bits.
206191
pub fn buffer_unary_not(left: &Buffer, offset_in_bits: usize, len_in_bits: usize) -> Buffer {
207-
bitwise_unary_op_helper(left, offset_in_bits, len_in_bits, |a| !a)
192+
// TODO: should we deprecate this function in favor of the Buffer ! impl ?
193+
BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, |a| !a).into_inner()
208194
}

arrow-buffer/src/util/bit_chunk_iterator.rs

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -202,11 +202,10 @@ fn compute_suffix_mask(len: usize, lead_padding: usize) -> (u64, usize) {
202202
(suffix_mask, trailing_padding)
203203
}
204204

205-
/// Iterates over an arbitrarily aligned byte buffer
205+
/// Iterates over an arbitrarily aligned byte buffer 64 bits at a time
206206
///
207-
/// Yields an iterator of u64, and a remainder. The first byte in the buffer
207+
/// [`Self::iter`] yields an iterator of `u64`, and a remainder. The first byte in the buffer
208208
/// will be the least significant byte in output u64
209-
///
210209
#[derive(Debug)]
211210
pub struct BitChunks<'a> {
212211
buffer: &'a [u8],
@@ -259,7 +258,7 @@ impl<'a> BitChunks<'a> {
259258
self.remainder_len
260259
}
261260

262-
/// Returns the number of chunks
261+
/// Returns the number of `u64` chunks
263262
#[inline]
264263
pub const fn chunk_len(&self) -> usize {
265264
self.chunk_len
@@ -293,7 +292,28 @@ impl<'a> BitChunks<'a> {
293292
}
294293
}
295294

296-
/// Returns an iterator over chunks of 64 bits represented as an u64
295+
/// Return the number of `u64` that are needed to represent all bits
296+
/// (including remainder).
297+
///
298+
/// This is equal to `chunk_len + 1` if there is a remainder,
299+
/// otherwise it is equal to `chunk_len`.
300+
#[inline]
301+
pub fn num_u64s(&self) -> usize {
302+
if self.remainder_len == 0 {
303+
self.chunk_len
304+
} else {
305+
self.chunk_len + 1
306+
}
307+
}
308+
309+
/// Return the number of *bytes* that are needed to represent all bits
310+
/// (including remainder).
311+
#[inline]
312+
pub fn num_bytes(&self) -> usize {
313+
ceil(self.chunk_len * 64 + self.remainder_len, 8)
314+
}
315+
316+
/// Returns an iterator over chunks of 64 bits represented as an `u64`
297317
#[inline]
298318
pub const fn iter(&self) -> BitChunkIterator<'a> {
299319
BitChunkIterator::<'a> {

arrow-select/src/nullif.rs

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//! Implements the `nullif` function for Arrow arrays.
1919
2020
use arrow_array::{Array, ArrayRef, BooleanArray, make_array};
21-
use arrow_buffer::buffer::{bitwise_bin_op_helper, bitwise_unary_op_helper};
21+
use arrow_buffer::buffer::bitwise_bin_op_helper;
2222
use arrow_buffer::{BooleanBuffer, NullBuffer};
2323
use arrow_schema::{ArrowError, DataType};
2424

@@ -91,11 +91,13 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result<ArrayRef, ArrowE
9191
}
9292
None => {
9393
let mut null_count = 0;
94-
let buffer = bitwise_unary_op_helper(right.inner(), right.offset(), len, |b| {
95-
let t = !b;
96-
null_count += t.count_zeros() as usize;
97-
t
98-
});
94+
let buffer =
95+
BooleanBuffer::from_bitwise_unary_op(right.inner(), right.offset(), len, |b| {
96+
let t = !b;
97+
null_count += t.count_zeros() as usize;
98+
t
99+
})
100+
.into_inner();
99101
(buffer, null_count)
100102
}
101103
};

0 commit comments

Comments
 (0)