Skip to content

Commit 6f02fcf

Browse files
Dandandanclaude
andauthored
[Parquet] Improve dictionary decoder by unrolling loops (#9662)
## Which issue does this PR close? #9670 ## Rationale Improve dictionary decoding by unrolling loads / OOB check (largest win -15% for i32) and optimizing the `try_from_le_slice` usage to load from u64 instead (small win, mostly readability). ``` ┌──────────────────────────────────┬───────────────┬─────────┬─────────────┐ │ Benchmark │ upstream/main │ branch │ Improvement │ ├──────────────────────────────────┼───────────────┼─────────┼─────────────┤ │ dictionary, mandatory, no NULLs │ 58.5 µs │ 49.5 µs │ -15.4% │ ├──────────────────────────────────┼───────────────┼─────────┼─────────────┤ │ dictionary, optional, no NULLs │ 61.0 µs │ 52.2 µs │ -14.4% │ ├──────────────────────────────────┼───────────────┼─────────┼─────────────┤ │ dictionary, optional, half NULLs │ 97.1 µs │ 93.0 µs │ -4.2% │ └──────────────────────────────────┴───────────────┴─────────┴─────────────┘ ``` ## What changes are included in this PR? ## Are these changes tested? Existing tests cover this path thoroughly (21 bit_util tests pass). 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 65ad652 commit 6f02fcf

File tree

4 files changed

+111
-27
lines changed

4 files changed

+111
-27
lines changed

parquet/src/arrow/array_reader/byte_array_dictionary.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ use crate::column::reader::decoder::ColumnValueDecoder;
3434
use crate::encodings::rle::RleDecoder;
3535
use crate::errors::{ParquetError, Result};
3636
use crate::schema::types::ColumnDescPtr;
37-
use crate::util::bit_util::FromBytes;
37+
use crate::util::bit_util::FromBitpacked;
3838

3939
/// A macro to reduce verbosity of [`make_byte_array_dictionary_reader`]
4040
macro_rules! make_reader {
@@ -128,7 +128,7 @@ struct ByteArrayDictionaryReader<K: ArrowNativeType, V: OffsetSizeTrait> {
128128

129129
impl<K, V> ByteArrayDictionaryReader<K, V>
130130
where
131-
K: FromBytes + Ord + ArrowNativeType,
131+
K: FromBitpacked + Ord + ArrowNativeType,
132132
V: OffsetSizeTrait,
133133
{
134134
fn new(
@@ -148,7 +148,7 @@ where
148148

149149
impl<K, V> ArrayReader for ByteArrayDictionaryReader<K, V>
150150
where
151-
K: FromBytes + Ord + ArrowNativeType,
151+
K: FromBitpacked + Ord + ArrowNativeType,
152152
V: OffsetSizeTrait,
153153
{
154154
fn as_any(&self) -> &dyn Any {
@@ -226,7 +226,7 @@ struct DictionaryDecoder<K, V> {
226226

227227
impl<K, V> ColumnValueDecoder for DictionaryDecoder<K, V>
228228
where
229-
K: FromBytes + Ord + ArrowNativeType,
229+
K: FromBitpacked + Ord + ArrowNativeType,
230230
V: OffsetSizeTrait,
231231
{
232232
type Buffer = DictionaryBuffer<K, V>;

parquet/src/encodings/decoding.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use crate::encodings::decoding::byte_stream_split_decoder::{
3131
};
3232
use crate::errors::{ParquetError, Result};
3333
use crate::schema::types::ColumnDescPtr;
34-
use crate::util::bit_util::{self, BitReader};
34+
use crate::util::bit_util::{self, BitReader, FromBitpacked};
3535

3636
mod byte_stream_split_decoder;
3737

@@ -455,7 +455,10 @@ impl<T: DataType> RleValueDecoder<T> {
455455
}
456456
}
457457

458-
impl<T: DataType> Decoder<T> for RleValueDecoder<T> {
458+
impl<T: DataType> Decoder<T> for RleValueDecoder<T>
459+
where
460+
T::T: FromBitpacked,
461+
{
459462
#[inline]
460463
fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> {
461464
// Only support RLE value reader for boolean values with bit width of 1.
@@ -658,7 +661,7 @@ where
658661

659662
impl<T: DataType> Decoder<T> for DeltaBitPackDecoder<T>
660663
where
661-
T::T: Default + FromPrimitive + WrappingAdd + Copy,
664+
T::T: Default + FromPrimitive + FromBitpacked + WrappingAdd + Copy,
662665
{
663666
// # of total values is derived from encoding
664667
#[inline]

parquet/src/encodings/rle.rs

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ use std::{cmp, mem::size_of};
3939
use bytes::Bytes;
4040

4141
use crate::errors::{ParquetError, Result};
42-
use crate::util::bit_util::{self, BitReader, BitWriter, FromBytes};
42+
use crate::util::bit_util::{self, BitReader, BitWriter, FromBitpacked};
4343

4444
/// Maximum groups of 8 values per bit-packed run. Current value is 64.
4545
const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6;
@@ -84,7 +84,7 @@ impl RleEncoder {
8484

8585
/// Initialize the encoder from existing `buffer`
8686
pub fn new_from_buf(bit_width: u8, buffer: Vec<u8>) -> Self {
87-
assert!(bit_width <= 64);
87+
debug_assert!(bit_width <= 64);
8888
let bit_writer = BitWriter::new_from_buf(buffer);
8989
RleEncoder {
9090
bit_width,
@@ -352,7 +352,7 @@ impl RleDecoder {
352352
// that damage L1d-cache occupancy. This results in a ~18% performance drop
353353
#[inline(never)]
354354
#[allow(unused)]
355-
pub fn get<T: FromBytes>(&mut self) -> Result<Option<T>> {
355+
pub fn get<T: FromBitpacked>(&mut self) -> Result<Option<T>> {
356356
assert!(size_of::<T>() <= 8);
357357

358358
while self.rle_left == 0 && self.bit_packed_left == 0 {
@@ -388,7 +388,7 @@ impl RleDecoder {
388388
}
389389

390390
#[inline(never)]
391-
pub fn get_batch<T: FromBytes + Clone>(&mut self, buffer: &mut [T]) -> Result<usize> {
391+
pub fn get_batch<T: FromBitpacked + Clone>(&mut self, buffer: &mut [T]) -> Result<usize> {
392392
assert!(size_of::<T>() <= 8);
393393

394394
let mut values_read = 0;
@@ -469,7 +469,7 @@ impl RleDecoder {
469469
where
470470
T: Default + Clone,
471471
{
472-
assert!(buffer.len() >= max_values);
472+
debug_assert!(buffer.len() >= max_values);
473473

474474
let mut values_read = 0;
475475
while values_read < max_values {
@@ -507,10 +507,30 @@ impl RleDecoder {
507507
self.bit_packed_left = 0;
508508
break;
509509
}
510-
buffer[values_read..values_read + num_values]
511-
.iter_mut()
512-
.zip(index_buf[..num_values].iter())
513-
.for_each(|(b, i)| b.clone_from(&dict[*i as usize]));
510+
{
511+
let out = &mut buffer[values_read..values_read + num_values];
512+
let idx = &index_buf[..num_values];
513+
let mut out_chunks = out.chunks_exact_mut(8);
514+
let idx_chunks = idx.chunks_exact(8);
515+
for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) {
516+
let dict_len = dict.len();
517+
assert!(
518+
idx_chunk.iter().all(|&i| (i as usize) < dict_len),
519+
"dictionary index out of bounds"
520+
);
521+
for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) {
522+
// SAFETY: all indices checked above to be in bounds
523+
b.clone_from(unsafe { dict.get_unchecked(*i as usize) });
524+
}
525+
}
526+
for (b, i) in out_chunks
527+
.into_remainder()
528+
.iter_mut()
529+
.zip(idx.chunks_exact(8).remainder().iter())
530+
{
531+
b.clone_from(&dict[*i as usize]);
532+
}
533+
}
514534
self.bit_packed_left -= num_values as u32;
515535
values_read += num_values;
516536
if num_values < to_read {

parquet/src/util/bit_util.rs

Lines changed: 72 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,17 @@ pub unsafe trait FromBytes: Sized {
4646
fn from_le_bytes(bs: Self::Buffer) -> Self;
4747
}
4848

49+
/// Types that can be decoded from bitpacked representations.
50+
///
51+
/// This is implemented for primitive types and bool that can be
52+
/// directly converted from a u64 value. Types like Int96, ByteArray,
53+
/// and FixedLenByteArray that cannot be represented in 64 bits do not
54+
/// implement this trait.
55+
pub trait FromBitpacked: FromBytes {
56+
/// Convert directly from a u64 value by truncation, avoiding byte slice copies.
57+
fn from_u64(v: u64) -> Self;
58+
}
59+
4960
macro_rules! from_le_bytes {
5061
($($ty: ty),*) => {
5162
$(
@@ -60,11 +71,55 @@ macro_rules! from_le_bytes {
6071
<$ty>::from_le_bytes(bs)
6172
}
6273
}
74+
impl FromBitpacked for $ty {
75+
#[inline]
76+
fn from_u64(v: u64) -> Self {
77+
v as Self
78+
}
79+
}
6380
)*
6481
};
6582
}
6683

67-
from_le_bytes! { u8, u16, u32, u64, i8, i16, i32, i64, f32, f64 }
84+
from_le_bytes! { u8, u16, u32, u64, i8, i16, i32, i64 }
85+
86+
// SAFETY: all bit patterns are valid for f32 and f64.
87+
unsafe impl FromBytes for f32 {
88+
const BIT_CAPACITY: usize = 32;
89+
type Buffer = [u8; 4];
90+
fn try_from_le_slice(b: &[u8]) -> Result<Self> {
91+
Ok(Self::from_le_bytes(array_from_slice(b)?))
92+
}
93+
fn from_le_bytes(bs: Self::Buffer) -> Self {
94+
f32::from_le_bytes(bs)
95+
}
96+
}
97+
98+
impl FromBitpacked for f32 {
99+
#[inline]
100+
fn from_u64(v: u64) -> Self {
101+
f32::from_bits(v as u32)
102+
}
103+
}
104+
105+
// SAFETY: all bit patterns are valid for f64.
106+
unsafe impl FromBytes for f64 {
107+
const BIT_CAPACITY: usize = 64;
108+
type Buffer = [u8; 8];
109+
fn try_from_le_slice(b: &[u8]) -> Result<Self> {
110+
Ok(Self::from_le_bytes(array_from_slice(b)?))
111+
}
112+
fn from_le_bytes(bs: Self::Buffer) -> Self {
113+
f64::from_le_bytes(bs)
114+
}
115+
}
116+
117+
impl FromBitpacked for f64 {
118+
#[inline]
119+
fn from_u64(v: u64) -> Self {
120+
f64::from_bits(v)
121+
}
122+
}
68123

69124
// SAFETY: the 0000000x bit pattern is always valid for `bool`.
70125
unsafe impl FromBytes for bool {
@@ -79,6 +134,13 @@ unsafe impl FromBytes for bool {
79134
}
80135
}
81136

137+
impl FromBitpacked for bool {
138+
#[inline]
139+
fn from_u64(v: u64) -> Self {
140+
v != 0
141+
}
142+
}
143+
82144
// SAFETY: BIT_CAPACITY is 0.
83145
unsafe impl FromBytes for Int96 {
84146
const BIT_CAPACITY: usize = 0;
@@ -139,7 +201,7 @@ pub(crate) fn read_num_bytes<T>(size: usize, src: &[u8]) -> T
139201
where
140202
T: FromBytes,
141203
{
142-
assert!(size <= src.len());
204+
debug_assert!(size <= src.len());
143205
let mut buffer = <T as FromBytes>::Buffer::default();
144206
buffer.as_mut()[..size].copy_from_slice(&src[..size]);
145207
<T>::from_le_bytes(buffer)
@@ -413,9 +475,9 @@ impl BitReader {
413475
/// Reads a value of type `T` and of size `num_bits`.
414476
///
415477
/// Returns `None` if there's not enough data available. `Some` otherwise.
416-
pub fn get_value<T: FromBytes>(&mut self, num_bits: usize) -> Option<T> {
417-
assert!(num_bits <= 64);
418-
assert!(num_bits <= size_of::<T>() * 8);
478+
pub fn get_value<T: FromBitpacked>(&mut self, num_bits: usize) -> Option<T> {
479+
debug_assert!(num_bits <= 64);
480+
debug_assert!(num_bits <= size_of::<T>() * 8);
419481

420482
if self.byte_offset * 8 + self.bit_offset + num_bits > self.buffer.len() * 8 {
421483
return None;
@@ -445,8 +507,7 @@ impl BitReader {
445507
}
446508
}
447509

448-
// TODO: better to avoid copying here
449-
T::try_from_le_slice(v.as_bytes()).ok()
510+
Some(T::from_u64(v))
450511
}
451512

452513
/// Read multiple values from their packed representation where each element is represented
@@ -457,8 +518,8 @@ impl BitReader {
457518
/// This function panics if
458519
/// - `num_bits` is larger than the bit-capacity of `T`
459520
///
460-
pub fn get_batch<T: FromBytes>(&mut self, batch: &mut [T], num_bits: usize) -> usize {
461-
assert!(num_bits <= size_of::<T>() * 8);
521+
pub fn get_batch<T: FromBitpacked>(&mut self, batch: &mut [T], num_bits: usize) -> usize {
522+
debug_assert!(num_bits <= size_of::<T>() * 8);
462523

463524
let mut values_to_read = batch.len();
464525
let needed_bits = num_bits * values_to_read;
@@ -602,7 +663,7 @@ impl BitReader {
602663
///
603664
/// Return the number of values skipped (up to num_values)
604665
pub fn skip(&mut self, num_values: usize, num_bits: usize) -> usize {
605-
assert!(num_bits <= 64);
666+
debug_assert!(num_bits <= 64);
606667

607668
let needed_bits = num_bits * num_values;
608669
let remaining_bits = (self.buffer.len() - self.byte_offset) * 8 - self.bit_offset;
@@ -1024,7 +1085,7 @@ mod tests {
10241085

10251086
fn test_get_batch_helper<T>(total: usize, num_bits: usize)
10261087
where
1027-
T: FromBytes + Default + Clone + Debug + Eq,
1088+
T: FromBitpacked + Default + Clone + Debug + Eq,
10281089
{
10291090
assert!(num_bits <= 64);
10301091
let num_bytes = ceil(num_bits, 8);

0 commit comments

Comments
 (0)