Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions arrow-buffer/src/buffer/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -800,4 +800,27 @@ mod tests {
}
}
}

#[test]
fn test_extend_trusted_len_sets_byte_len() {
// Ensures extend_trusted_len keeps the underlying byte length in sync with bit length.
let mut builder = BooleanBufferBuilder::new(0);
let bools: Vec<_> = (0..10).map(|i| i % 2 == 0).collect();
unsafe { builder.extend_trusted_len(bools.into_iter()) };
assert_eq!(builder.as_slice().len(), bit_util::ceil(builder.len(), 8));
}

#[test]
fn test_extend_trusted_len_then_append() {
// Exercises append after extend_trusted_len to validate byte length and values.
let mut builder = BooleanBufferBuilder::new(0);
let bools: Vec<_> = (0..9).map(|i| i % 3 == 0).collect();
unsafe { builder.extend_trusted_len(bools.clone().into_iter()) };
builder.append(true);
assert_eq!(builder.as_slice().len(), bit_util::ceil(builder.len(), 8));
let finished = builder.finish();
for (i, v) in bools.into_iter().chain(std::iter::once(true)).enumerate() {
assert_eq!(finished.value(i), v, "at index {}", i);
}
}
}
128 changes: 128 additions & 0 deletions arrow-buffer/src/buffer/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -623,6 +623,134 @@ impl MutableBuffer {
buffer
}

/// Extends this buffer with boolean values.
///
/// This requires `iter` to report an exact size via `size_hint`.
Comment thread
Dandandan marked this conversation as resolved.
/// `offset` indicates the starting offset in bits in this buffer to begin writing to
/// and must be less than or equal to the current length of this buffer.
/// All bits not written to (but readable due to byte alignment) will be zeroed out.
/// # Safety
/// Callers must ensure that `iter` reports an exact size via `size_hint`.
#[inline]
pub unsafe fn extend_bool_trusted_len<I: Iterator<Item = bool>>(
&mut self,
mut iter: I,
offset: usize,
) {
let (lower, upper) = iter.size_hint();
let len = upper.expect("Iterator must have exact size_hint");
assert_eq!(lower, len, "Iterator must have exact size_hint");
debug_assert!(
offset <= self.len * 8,
"offset must be <= buffer length in bits"
);

if len == 0 {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically speaking there is no test coverage for this branch

Image

return;
}

let start_len = offset;
let end_bit = start_len + len;

// SAFETY: we will initialize all newly exposed bytes before they are read
let new_len_bytes = bit_util::ceil(end_bit, 8);
if new_len_bytes > self.len {
self.reserve(new_len_bytes - self.len);
// SAFETY: caller will initialize all newly exposed bytes before they are read
unsafe { self.set_len(new_len_bytes) };
}

let slice = self.as_slice_mut();

let mut bit_idx = start_len;

// ---- Unaligned prefix: advance to the next 64-bit boundary ----
let misalignment = bit_idx & 63;
let prefix_bits = if misalignment == 0 {
0
} else {
(64 - misalignment).min(end_bit - bit_idx)
};

if prefix_bits != 0 {
let byte_start = bit_idx / 8;
let byte_end = bit_util::ceil(bit_idx + prefix_bits, 8);
let bit_offset = bit_idx % 8;

// Clear any newly-visible bits in the existing partial byte
if bit_offset != 0 {
let keep_mask = (1u8 << bit_offset).wrapping_sub(1);
slice[byte_start] &= keep_mask;
}

// Zero any new bytes we will partially fill in this prefix
let zero_from = if bit_offset == 0 {
byte_start
} else {
byte_start + 1
};
if byte_end > zero_from {
slice[zero_from..byte_end].fill(0);
}

for _ in 0..prefix_bits {
let v = iter.next().unwrap();
if v {
let byte_idx = bit_idx / 8;
let bit = bit_idx % 8;
slice[byte_idx] |= 1 << bit;
}
bit_idx += 1;
}
}

if bit_idx < end_bit {
// ---- Aligned middle: write u64 chunks ----
debug_assert_eq!(bit_idx & 63, 0);
let remaining_bits = end_bit - bit_idx;
let chunks = remaining_bits / 64;

let words_start = bit_idx / 8;
let words_end = words_start + chunks * 8;
for dst in slice[words_start..words_end].chunks_exact_mut(8) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once we have MSRV 1.88 we could also use as_chunks_mut and the compiler will know exactly how long the slice is:

            let (chunks, _remain) = slice[words_start..words_end].as_chunks_mut::<8>();
            for dst in chunks {

However, it isn't quite ready

error: current MSRV (Minimum Supported Rust Version) is `1.85.0` but this item is stable since `1.88.0`
   --> arrow-buffer/src/buffer/mutable.rs:720:67
    |
720 |             let (chunks, _remain) = slice[words_start..words_end].as_chunks_mut::<8>();
    |                                                                   ^^^^^^^^^^^^^^^^^^^^

let mut packed: u64 = 0;
for i in 0..64 {
packed |= (iter.next().unwrap() as u64) << i;
}
dst.copy_from_slice(&packed.to_le_bytes());
bit_idx += 64;
}

// ---- Unaligned suffix: remaining < 64 bits ----
let suffix_bits = end_bit - bit_idx;
if suffix_bits != 0 {
debug_assert_eq!(bit_idx % 8, 0);
let byte_start = bit_idx / 8;
let byte_end = bit_util::ceil(end_bit, 8);
slice[byte_start..byte_end].fill(0);

for _ in 0..suffix_bits {
let v = iter.next().unwrap();
if v {
let byte_idx = bit_idx / 8;
let bit = bit_idx % 8;
slice[byte_idx] |= 1 << bit;
}
bit_idx += 1;
}
}
}

// Clear any unused bits in the last byte
let remainder = end_bit % 8;
if remainder != 0 {
let mask = (1u8 << remainder).wrapping_sub(1);
slice[bit_util::ceil(end_bit, 8) - 1] &= mask;
}

debug_assert_eq!(bit_idx, end_bit);
}

/// Register this [`MutableBuffer`] with the provided [`MemoryPool`]
///
/// This claims the memory used by this buffer in the pool, allowing for
Expand Down
75 changes: 75 additions & 0 deletions arrow-buffer/src/builder/boolean.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,20 @@ impl BooleanBufferBuilder {
pub fn finish_cloned(&self) -> BooleanBuffer {
BooleanBuffer::new(Buffer::from_slice_ref(self.as_slice()), 0, self.len)
}

/// Extends the builder from a trusted length iterator of booleans.
/// # Safety
/// Callers must ensure that `iter` reports an exact size via `size_hint`.
///
#[inline]
pub unsafe fn extend_trusted_len<I>(&mut self, iterator: I)
where
I: Iterator<Item = bool>,
{
let len = iterator.size_hint().0;
unsafe { self.buffer.extend_bool_trusted_len(iterator, self.len) };
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might make it easier to understand the correctness if extend_bool_trusted_len returned the number of bits that were appended rather than having to get it from the iterator

self.len += len;
}
}

impl From<BooleanBufferBuilder> for Buffer {
Expand Down Expand Up @@ -526,4 +540,65 @@ mod tests {
assert_eq!(buf.len(), buf2.inner().len());
assert_eq!(buf.as_slice(), buf2.values());
}

#[test]
fn test_extend() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend adding some explicit tests for the mutable buffer API directly as well.

let mut builder = BooleanBufferBuilder::new(0);
let bools = vec![true, false, true, true, false, true, true, true, false];
unsafe { builder.extend_trusted_len(bools.clone().into_iter()) };
assert_eq!(builder.len(), 9);
let finished = builder.finish();
for (i, v) in bools.into_iter().enumerate() {
assert_eq!(finished.value(i), v);
}

// Test > 64 bits
let mut builder = BooleanBufferBuilder::new(0);
let bools: Vec<_> = (0..100).map(|i| i % 3 == 0 || i % 7 == 0).collect();
unsafe { builder.extend_trusted_len(bools.clone().into_iter()) };
assert_eq!(builder.len(), 100);
let finished = builder.finish();
for (i, v) in bools.into_iter().enumerate() {
assert_eq!(finished.value(i), v, "at index {}", i);
}
}

#[test]
fn test_extend_misaligned() {
// Test misaligned start
for offset in 1..65 {
let mut builder = BooleanBufferBuilder::new(0);
builder.append_n(offset, false);

let bools: Vec<_> = (0..100).map(|i| i % 3 == 0 || i % 7 == 0).collect();
unsafe { builder.extend_trusted_len(bools.clone().into_iter()) };
assert_eq!(builder.len(), offset + 100);

let finished = builder.finish();
for i in 0..offset {
assert!(!finished.value(i));
}
for (i, v) in bools.into_iter().enumerate() {
assert_eq!(finished.value(offset + i), v, "at index {}", offset + i);
}
}
}

#[test]
fn test_extend_misaligned_end() {
for len in 1..130 {
let mut builder = BooleanBufferBuilder::new(0);
let mut bools: Vec<_> = (0..len).map(|i| i % 2 == 0).collect();
unsafe { builder.extend_trusted_len(bools.clone().into_iter()) };
unsafe { builder.extend_trusted_len(bools.clone().into_iter()) };
let copy = bools.clone();
bools.extend(copy);
assert_eq!(builder.len(), 2 * len);

let finished = builder.finish();
for (i, &v) in bools.iter().enumerate() {
assert_eq!(finished.value(i), v, "at index {} for len {}", i, len);
}
}
}
Comment thread
Dandandan marked this conversation as resolved.
}
Loading