Skip to content

Commit 1fec0fb

Browse files
alamb and Jefffrey authored
Improve arrow-buffer documentation (#9020)
# Which issue does this PR close?

- Related to #8806

# Rationale for this change

As I study this code, I want to encode my learnings for both my future self and future readers of this code.

# What changes are included in this PR?

Add documentation on the structure of the arrow buffer crate as well as details on how BooleanBuffer works.

# Are these changes tested?

CI

# Are there any user-facing changes?

Just docs, no functional changes.

---------

Co-authored-by: Jeffrey Vo <jeffrey.vo.australia@gmail.com>
1 parent 34337d2 commit 1fec0fb

File tree

2 files changed

+61
-1
lines changed

2 files changed

+61
-1
lines changed

arrow-buffer/src/buffer/boolean.rs

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,58 @@ use std::ops::{BitAnd, BitOr, BitXor, Not};
2626

2727
/// A slice-able [`Buffer`] containing bit-packed booleans
2828
///
29-
/// `BooleanBuffer`s can be modified using [`BooleanBufferBuilder`]
29+
/// This structure represents a sequence of boolean values packed into a
30+
/// byte-aligned [`Buffer`]. Both the offset and length are represented in bits.
3031
///
32+
/// # Layout
33+
///
34+
/// The values are bit-packed in little-endian (least-significant-bit first)
35+
/// order: the least significant bit of each byte holds the first boolean
36+
/// value, and subsequent values proceed toward the most significant bit.
37+
///
38+
/// For example, the 10-bit bitmask `0b0111001101` has length 10, and is
39+
/// represented using 2 bytes with offset 0 like this:
40+
///
41+
/// ```text
42+
/// ┌─────────────────────────────────┐ ┌─────────────────────────────────┐
43+
/// │┌───┬───┬───┬───┬───┬───┬───┬───┐│ │┌───┬───┬───┬───┬───┬───┬───┬───┐│
44+
/// ││ 1 │ 0 │ 1 │ 1 │ 0 │ 0 │ 1 │ 1 ││ ││ 1 │ 0 │ ? │ ? │ ? │ ? │ ? │ ? ││
45+
/// │└───┴───┴───┴───┴───┴───┴───┴───┘│ │└───┴───┴───┴───┴───┴───┴───┴───┘│
46+
/// bit └─────────────────────────────────┘ └─────────────────────────────────┘
47+
/// offset 0 Byte 0 7 0 Byte 1 7
48+
///
49+
/// length = 10 bits, offset = 0
50+
/// ```
51+
///
52+
/// The same bitmask with length 10 and offset 3 would be represented using 2
53+
/// bytes like this:
54+
///
55+
/// ```text
56+
/// ┌─────────────────────────────────┐ ┌─────────────────────────────────┐
57+
/// │┌───┬───┬───┬───┬───┬───┬───┬───┐│ │┌───┬───┬───┬───┬───┬───┬───┬───┐│
58+
/// ││ ? │ ? │ ? │ 1 │ 0 │ 1 │ 1 │ 0 ││ ││ 0 │ 1 │ 1 │ 1 │ 0 │ ? │ ? │ ? ││
59+
/// │└───┴───┴───┴───┴───┴───┴───┴───┘│ │└───┴───┴───┴───┴───┴───┴───┴───┘│
60+
/// bit └─────────────────────────────────┘ └─────────────────────────────────┘
61+
/// offset 0 Byte 0 7 0 Byte 1 7
62+
///
63+
/// length = 10 bits, offset = 3
64+
/// ```
65+
///
66+
/// Note that the bits marked `?` are not logically part of the mask and may
67+
/// contain either `0` or `1`.
3168
///
3269
/// # See Also
70+
/// * [`BooleanBufferBuilder`] for building [`BooleanBuffer`] instances
3371
/// * [`NullBuffer`] for representing null values in Arrow arrays
3472
///
3573
/// [`NullBuffer`]: crate::NullBuffer
3674
#[derive(Debug, Clone, Eq)]
3775
pub struct BooleanBuffer {
76+
/// Underlying buffer (byte aligned)
3877
buffer: Buffer,
78+
/// Offset in bits (not bytes)
3979
offset: usize,
80+
/// Length in bits (not bytes)
4081
len: usize,
4182
}
4283

@@ -304,12 +345,16 @@ impl BooleanBuffer {
304345
}
305346

306347
/// Returns the inner [`Buffer`]
348+
///
349+
/// Note: the returned [`Buffer`] is not adjusted for the offset and length of this [`BooleanBuffer`]
307350
#[inline]
308351
pub fn inner(&self) -> &Buffer {
309352
&self.buffer
310353
}
311354

312355
/// Returns the inner [`Buffer`], consuming self
356+
///
357+
/// Note: the returned [`Buffer`] is not adjusted for the offset and length of this [`BooleanBuffer`]
313358
pub fn into_inner(self) -> Buffer {
314359
self.buffer
315360
}

arrow-buffer/src/lib.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,21 @@
1616
// under the License.
1717

1818
//! Low-level buffer abstractions for [Apache Arrow Rust](https://docs.rs/arrow)
19+
//!
20+
//! # Byte Storage Abstractions
21+
//! - [`MutableBuffer`]: Raw memory buffer that can be mutated and grown
22+
//! - [`Buffer`]: Immutable buffer that can be shared across threads
23+
//!
24+
//! # Typed Abstractions
25+
//!
26+
//! There are also several wrappers over [`Buffer`] with methods for
27+
//! easier manipulation:
28+
//!
29+
//! - [`BooleanBuffer`][]: Bitmasks (buffer of packed bits)
30+
//! - [`NullBuffer`][]: Arrow null (validity) bitmaps ([`BooleanBuffer`] with extra utilities)
31+
//! - [`ScalarBuffer<T>`][]: Typed buffer for primitive types (e.g., `i32`, `f64`)
32+
//! - [`OffsetBuffer<O>`][]: Offsets used in variable-length types (e.g., strings, lists)
33+
//! - [`RunEndBuffer<E>`][]: Run-ends used in run-end encoded data
1934
2035
#![doc(
2136
html_logo_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",

0 commit comments

Comments
 (0)