From 0c1ec228eac6bca830dce249de1b65e10b72d52e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 4 Apr 2026 09:47:44 +0200 Subject: [PATCH 1/9] Avoid byte slice copy in BitReader::get_value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `from_u64` method to `FromBytes` trait to convert directly from the u64 bit buffer, eliminating the intermediate `as_bytes()` → `try_from_le_slice()` round-trip that copied through a byte slice. Co-Authored-By: Claude Opus 4.6 (1M context) --- parquet/src/util/bit_util.rs | 56 ++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 7d7907f6f543..d627cf025636 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -44,6 +44,8 @@ pub unsafe trait FromBytes: Sized { type Buffer: AsMut<[u8]> + Default; fn try_from_le_slice(b: &[u8]) -> Result; fn from_le_bytes(bs: Self::Buffer) -> Self; + /// Convert directly from a u64 value by truncation, avoiding byte slice copies. + fn from_u64(v: u64) -> Self; } macro_rules! from_le_bytes { @@ -59,12 +61,48 @@ macro_rules! from_le_bytes { fn from_le_bytes(bs: Self::Buffer) -> Self { <$ty>::from_le_bytes(bs) } + #[inline] + fn from_u64(v: u64) -> Self { + v as Self + } } )* }; } -from_le_bytes! { u8, u16, u32, u64, i8, i16, i32, i64, f32, f64 } +from_le_bytes! { u8, u16, u32, u64, i8, i16, i32, i64 } + +// SAFETY: all bit patterns are valid for f32 and f64. +unsafe impl FromBytes for f32 { + const BIT_CAPACITY: usize = 32; + type Buffer = [u8; 4]; + fn try_from_le_slice(b: &[u8]) -> Result { + Ok(Self::from_le_bytes(array_from_slice(b)?)) + } + fn from_le_bytes(bs: Self::Buffer) -> Self { + f32::from_le_bytes(bs) + } + #[inline] + fn from_u64(v: u64) -> Self { + f32::from_bits(v as u32) + } +} + +// SAFETY: all bit patterns are valid for f64. +unsafe impl FromBytes for f64 { + const BIT_CAPACITY: usize = 64; + type Buffer = [u8; 8]; + fn try_from_le_slice(b: &[u8]) -> Result { + Ok(Self::from_le_bytes(array_from_slice(b)?)) + } + fn from_le_bytes(bs: Self::Buffer) -> Self { + f64::from_le_bytes(bs) + } + #[inline] + fn from_u64(v: u64) -> Self { + f64::from_bits(v) + } +} // SAFETY: the 0000000x bit pattern is always valid for `bool`. unsafe impl FromBytes for bool { @@ -77,6 +115,10 @@ unsafe impl FromBytes for bool { fn from_le_bytes(bs: Self::Buffer) -> Self { bs[0] != 0 } + #[inline] + fn from_u64(v: u64) -> Self { + v != 0 + } } // SAFETY: BIT_CAPACITY is 0. @@ -104,6 +146,9 @@ unsafe impl FromBytes for Int96 { ); i } + fn from_u64(_v: u64) -> Self { + unreachable!("Int96 does not support from_u64") + } } // SAFETY: BIT_CAPACITY is 0. @@ -117,6 +162,9 @@ unsafe impl FromBytes for ByteArray { fn from_le_bytes(bs: Self::Buffer) -> Self { bs.into() } + fn from_u64(_v: u64) -> Self { + unreachable!("ByteArray does not support from_u64") + } } // SAFETY: BIT_CAPACITY is 0. @@ -130,6 +178,9 @@ unsafe impl FromBytes for FixedLenByteArray { fn from_le_bytes(bs: Self::Buffer) -> Self { bs.into() } + fn from_u64(_v: u64) -> Self { + unreachable!("FixedLenByteArray does not support from_u64") + } } /// Reads `size` of bytes from `src`, and reinterprets them as type `ty`, in @@ -445,8 +496,7 @@ impl BitReader { } } - // TODO: better to avoid copying here - T::try_from_le_slice(v.as_bytes()).ok() + Some(T::from_u64(v)) } /// Read multiple values from their packed representation where each element is represented From acdbbe05cf2d7546415bb4859556a8da29d562fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 4 Apr 2026 13:26:27 +0200 Subject: [PATCH 2/9] Introduce FromBitpacked trait and use debug_assert in BitReader Replace unreachable!() stubs for from_u64 on Int96/ByteArray/FixedLenByteArray with a separate FromBitpacked trait that is only implemented for types that can actually be converted from u64 (primitives, floats, bool). Also convert assert! to debug_assert! for num_bits bounds checks in get_value/get_batch. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../array_reader/byte_array_dictionary.rs | 8 ++-- parquet/src/encodings/decoding.rs | 9 +++-- parquet/src/encodings/rle.rs | 6 +-- parquet/src/util/bit_util.rs | 37 ++++++++++++------- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index f7b93264b760..1f77b4bd2f93 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -34,7 +34,7 @@ use crate::column::reader::decoder::ColumnValueDecoder; use crate::encodings::rle::RleDecoder; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::bit_util::FromBytes; +use crate::util::bit_util::FromBitpacked; /// A macro to reduce verbosity of [`make_byte_array_dictionary_reader`] macro_rules! make_reader { @@ -128,7 +128,7 @@ struct ByteArrayDictionaryReader { impl ByteArrayDictionaryReader where - K: FromBytes + Ord + ArrowNativeType, + K: FromBitpacked + Ord + ArrowNativeType, V: OffsetSizeTrait, { fn new( @@ -148,7 +148,7 @@ where impl ArrayReader for ByteArrayDictionaryReader where - K: FromBytes + Ord + ArrowNativeType, + K: FromBitpacked + Ord + ArrowNativeType, V: OffsetSizeTrait, { fn as_any(&self) -> &dyn Any { @@ -226,7 +226,7 @@ struct DictionaryDecoder { impl ColumnValueDecoder for DictionaryDecoder where - K: FromBytes + Ord + ArrowNativeType, + K: FromBitpacked + Ord + ArrowNativeType, V: OffsetSizeTrait, { type Buffer = DictionaryBuffer; diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 7da21e6dd091..5bc2cc9f11f8 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -31,7 +31,7 @@ use crate::encodings::decoding::byte_stream_split_decoder::{ }; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::bit_util::{self, BitReader}; +use crate::util::bit_util::{self, BitReader, FromBitpacked}; mod byte_stream_split_decoder; @@ -455,7 +455,10 @@ impl RleValueDecoder { } } -impl Decoder for RleValueDecoder { +impl Decoder for RleValueDecoder +where + T::T: FromBitpacked, +{ #[inline] fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> { // Only support RLE value reader for boolean values with bit width of 1. @@ -658,7 +661,7 @@ where impl Decoder for DeltaBitPackDecoder where - T::T: Default + FromPrimitive + WrappingAdd + Copy, + T::T: Default + FromPrimitive + FromBitpacked + WrappingAdd + Copy, { // # of total values is derived from encoding #[inline] diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 2815c20dab56..3129e026de9d 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -39,7 +39,7 @@ use std::{cmp, mem::size_of}; use bytes::Bytes; use crate::errors::{ParquetError, Result}; -use crate::util::bit_util::{self, BitReader, BitWriter, FromBytes}; +use crate::util::bit_util::{self, BitReader, BitWriter, FromBitpacked}; /// Maximum groups of 8 values per bit-packed run. Current value is 64. const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6; @@ -352,7 +352,7 @@ impl RleDecoder { // that damage L1d-cache occupancy. This results in a ~18% performance drop #[inline(never)] #[allow(unused)] - pub fn get(&mut self) -> Result> { + pub fn get(&mut self) -> Result> { assert!(size_of::() <= 8); while self.rle_left == 0 && self.bit_packed_left == 0 { @@ -388,7 +388,7 @@ impl RleDecoder { } #[inline(never)] - pub fn get_batch(&mut self, buffer: &mut [T]) -> Result { + pub fn get_batch(&mut self, buffer: &mut [T]) -> Result { assert!(size_of::() <= 8); let mut values_read = 0; diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index d627cf025636..5259d2ee4abb 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -44,6 +44,15 @@ pub unsafe trait FromBytes: Sized { type Buffer: AsMut<[u8]> + Default; fn try_from_le_slice(b: &[u8]) -> Result; fn from_le_bytes(bs: Self::Buffer) -> Self; +} + +/// Types that can be decoded from bitpacked representations. +/// +/// This is implemented for primitive types and bool that can be +/// directly converted from a u64 value. Types like Int96, ByteArray, +/// and FixedLenByteArray that cannot be represented in 64 bits do not +/// implement this trait. +pub trait FromBitpacked: FromBytes { /// Convert directly from a u64 value by truncation, avoiding byte slice copies. fn from_u64(v: u64) -> Self; } @@ -61,6 +70,8 @@ macro_rules! from_le_bytes { fn from_le_bytes(bs: Self::Buffer) -> Self { <$ty>::from_le_bytes(bs) } + } + impl FromBitpacked for $ty { #[inline] fn from_u64(v: u64) -> Self { v as Self @@ -82,6 +93,9 @@ unsafe impl FromBytes for f32 { fn from_le_bytes(bs: Self::Buffer) -> Self { f32::from_le_bytes(bs) } +} + +impl FromBitpacked for f32 { #[inline] fn from_u64(v: u64) -> Self { f32::from_bits(v as u32) @@ -98,6 +112,9 @@ unsafe impl FromBytes for f64 { fn from_le_bytes(bs: Self::Buffer) -> Self { f64::from_le_bytes(bs) } +} + +impl FromBitpacked for f64 { #[inline] fn from_u64(v: u64) -> Self { f64::from_bits(v) @@ -115,6 +132,9 @@ unsafe impl FromBytes for bool { fn from_le_bytes(bs: Self::Buffer) -> Self { bs[0] != 0 } +} + +impl FromBitpacked for bool { #[inline] fn from_u64(v: u64) -> Self { v != 0 @@ -146,9 +166,6 @@ unsafe impl FromBytes for Int96 { ); i } - fn from_u64(_v: u64) -> Self { - unreachable!("Int96 does not support from_u64") - } } // SAFETY: BIT_CAPACITY is 0. @@ -162,9 +179,6 @@ unsafe impl FromBytes for ByteArray { fn from_le_bytes(bs: Self::Buffer) -> Self { bs.into() } - fn from_u64(_v: u64) -> Self { - unreachable!("ByteArray does not support from_u64") - } } // SAFETY: BIT_CAPACITY is 0. @@ -178,9 +192,6 @@ unsafe impl FromBytes for FixedLenByteArray { fn from_le_bytes(bs: Self::Buffer) -> Self { bs.into() } - fn from_u64(_v: u64) -> Self { - unreachable!("FixedLenByteArray does not support from_u64") - } } /// Reads `size` of bytes from `src`, and reinterprets them as type `ty`, in @@ -464,7 +475,7 @@ impl BitReader { /// Reads a value of type `T` and of size `num_bits`. /// /// Returns `None` if there's not enough data available. `Some` otherwise. - pub fn get_value(&mut self, num_bits: usize) -> Option { + pub fn get_value(&mut self, num_bits: usize) -> Option { assert!(num_bits <= 64); assert!(num_bits <= size_of::() * 8); @@ -507,8 +518,8 @@ impl BitReader { /// This function panics if /// - `num_bits` is larger than the bit-capacity of `T` /// - pub fn get_batch(&mut self, batch: &mut [T], num_bits: usize) -> usize { - assert!(num_bits <= size_of::() * 8); + pub fn get_batch(&mut self, batch: &mut [T], num_bits: usize) -> usize { + debug_assert!(num_bits <= size_of::() * 8); let mut values_to_read = batch.len(); let needed_bits = num_bits * values_to_read; @@ -1074,7 +1085,7 @@ mod tests { fn test_get_batch_helper(total: usize, num_bits: usize) where - T: FromBytes + Default + Clone + Debug + Eq, + T: FromBitpacked + Default + Clone + Debug + Eq, { assert!(num_bits <= 64); let num_bytes = ceil(num_bits, 8); From fd18da1bd6cc44496d6c3b338e1814543718d2b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 4 Apr 2026 14:49:55 +0200 Subject: [PATCH 3/9] Unroll dictionary scatter loop in RleDecoder::get_batch_with_dict Use chunks_exact(8) to process dictionary index lookups in groups of 8, allowing the compiler to unroll the inner loop and pipeline dependent memory accesses. This gives ~12% improvement on dictionary-encoded reads. Co-Authored-By: Claude Opus 4.6 (1M context) --- parquet/src/encodings/rle.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 3129e026de9d..7f0700287229 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -507,10 +507,20 @@ impl RleDecoder { self.bit_packed_left = 0; break; } - buffer[values_read..values_read + num_values] - .iter_mut() - .zip(index_buf[..num_values].iter()) - .for_each(|(b, i)| b.clone_from(&dict[*i as usize])); + { + let out = &mut buffer[values_read..values_read + num_values]; + let idx = &index_buf[..num_values]; + let mut out_chunks = out.chunks_exact_mut(8); + let idx_chunks = idx.chunks_exact(8); + for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) { + for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { + b.clone_from(&dict[*i as usize]); + } + } + for (b, i) in out_chunks.into_remainder().iter_mut().zip(idx.chunks_exact(8).remainder().iter()) { + b.clone_from(&dict[*i as usize]); + } + } self.bit_packed_left -= num_values as u32; values_read += num_values; if num_values < to_read { From ac3135787e734cd17687b668219bcfc9af4aed4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 4 Apr 2026 14:53:39 +0200 Subject: [PATCH 4/9] Use single bounds check per chunk in dictionary scatter loop Instead of bounds-checking each dictionary index individually, compute the max index across the 8-element chunk and check once. This allows using get_unchecked in the inner loop, improving dictionary-encoded reads by ~17% (up from ~12% with per-element checks). Co-Authored-By: Claude Opus 4.6 (1M context) --- parquet/src/encodings/rle.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 7f0700287229..dd9569ef6823 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -513,8 +513,11 @@ impl RleDecoder { let mut out_chunks = out.chunks_exact_mut(8); let idx_chunks = idx.chunks_exact(8); for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) { + let max_idx = idx_chunk.iter().copied().max().unwrap_or(0) as usize; + assert!(max_idx < dict.len(), "dictionary index out of bounds"); for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { - b.clone_from(&dict[*i as usize]); + // SAFETY: max of all indices checked above + b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); } } for (b, i) in out_chunks.into_remainder().iter_mut().zip(idx.chunks_exact(8).remainder().iter()) { From d69da720698375ed7e300386e0a290be0df03d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 4 Apr 2026 15:00:31 +0200 Subject: [PATCH 5/9] Fix rustfmt style_edition 2024 formatting Co-Authored-By: Claude Opus 4.6 (1M context) --- parquet/src/encodings/rle.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index dd9569ef6823..339006c598b5 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -520,7 +520,11 @@ impl RleDecoder { b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); } } - for (b, i) in out_chunks.into_remainder().iter_mut().zip(idx.chunks_exact(8).remainder().iter()) { + for (b, i) in out_chunks + .into_remainder() + .iter_mut() + .zip(idx.chunks_exact(8).remainder().iter()) + { b.clone_from(&dict[*i as usize]); } } From 5aba598d0d923c2dd8e6912dc274947540dd41e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 4 Apr 2026 15:03:13 +0200 Subject: [PATCH 6/9] Use all() instead of max() for dictionary bounds check Replace max()+assert with all(|i| i < len)+assert for the per-chunk dictionary bounds check. This avoids the reduction and allows the compiler to vectorize 8 independent comparisons, improving dictionary reads by ~19% over baseline (vs ~17% with max). Co-Authored-By: Claude Opus 4.6 (1M context) --- parquet/src/encodings/rle.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 339006c598b5..8113742c4558 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -513,8 +513,11 @@ impl RleDecoder { let mut out_chunks = out.chunks_exact_mut(8); let idx_chunks = idx.chunks_exact(8); for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) { - let max_idx = idx_chunk.iter().copied().max().unwrap_or(0) as usize; - assert!(max_idx < dict.len(), "dictionary index out of bounds"); + let dict_len = dict.len(); + assert!( + idx_chunk.iter().all(|&i| (i as usize) < dict_len), + "dictionary index out of bounds" + ); for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { // SAFETY: max of all indices checked above b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); From bdbb3a6b2b69ef71130a5fb1f147a5b9b3c6069f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Sat, 4 Apr 2026 19:05:55 +0200 Subject: [PATCH 7/9] Update safety comment to reflect all() bounds check Co-Authored-By: Claude Opus 4.6 (1M context) --- parquet/src/encodings/rle.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 8113742c4558..fc899ce493ea 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -519,7 +519,7 @@ impl RleDecoder { "dictionary index out of bounds" ); for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { - // SAFETY: max of all indices checked above + // SAFETY: all indices checked above to be in bounds b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); } } From 995041f618fa7ada5e8b3c3a419f065b83495900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 7 Apr 2026 11:48:26 +0200 Subject: [PATCH 8/9] Indent, debug_assert --- parquet/src/encodings/rle.rs | 42 +++++++++++++++++------------------- parquet/src/util/bit_util.rs | 6 +++--- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index fc899ce493ea..ddbd92bbda5e 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -507,30 +507,28 @@ impl RleDecoder { self.bit_packed_left = 0; break; } - { - let out = &mut buffer[values_read..values_read + num_values]; - let idx = &index_buf[..num_values]; - let mut out_chunks = out.chunks_exact_mut(8); - let idx_chunks = idx.chunks_exact(8); - for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) { - let dict_len = dict.len(); - assert!( - idx_chunk.iter().all(|&i| (i as usize) < dict_len), - "dictionary index out of bounds" - ); - for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { - // SAFETY: all indices checked above to be in bounds - b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); - } - } - for (b, i) in out_chunks - .into_remainder() - .iter_mut() - .zip(idx.chunks_exact(8).remainder().iter()) - { - b.clone_from(&dict[*i as usize]); + let out: &mut [T] = &mut buffer[values_read..values_read + num_values]; + let idx = &index_buf[..num_values]; + let mut out_chunks = out.chunks_exact_mut(8); + let idx_chunks = idx.chunks_exact(8); + for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) { + let dict_len = dict.len(); + assert!( + idx_chunk.iter().all(|&i| (i as usize) < dict_len), + "dictionary index out of bounds" + ); + for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { + // SAFETY: all indices checked above to be in bounds + b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); } } + for (b, i) in out_chunks + .into_remainder() + .iter_mut() + .zip(idx.chunks_exact(8).remainder().iter()) + { + b.clone_from(&dict[*i as usize]); + } self.bit_packed_left -= num_values as u32; values_read += num_values; if num_values < to_read { diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 5259d2ee4abb..4155c22bba79 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -476,8 +476,8 @@ impl BitReader { /// /// Returns `None` if there's not enough data available. `Some` otherwise. pub fn get_value(&mut self, num_bits: usize) -> Option { - assert!(num_bits <= 64); - assert!(num_bits <= size_of::() * 8); + debug_assert!(num_bits <= 64); + debug_assert!(num_bits <= size_of::() * 8); if self.byte_offset * 8 + self.bit_offset + num_bits > self.buffer.len() * 8 { return None; @@ -663,7 +663,7 @@ impl BitReader { /// /// Return the number of values skipped (up to num_values) pub fn skip(&mut self, num_values: usize, num_bits: usize) -> usize { - assert!(num_bits <= 64); + debug_assert!(num_bits <= 64); let needed_bits = num_bits * num_values; let remaining_bits = (self.buffer.len() - self.byte_offset) * 8 - self.bit_offset; From e955fcd913e668562dcb2eb02d7a47ce4a565de1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 7 Apr 2026 12:06:36 +0200 Subject: [PATCH 9/9] Use debug_assert for redundant runtime checks that add code size Change assert! to debug_assert! in read_num_bytes, RleEncoder::new_from_buf, and RleDecoder::get_batch_with_dict where the checks are either redundant (subsequent indexing already panics) or in cold constructor code. Verified via cargo-asm that this removes instructions from hot paths. Co-Authored-By: Claude Opus 4.6 (1M context) --- parquet/src/encodings/rle.rs | 46 +++++++++++++++++++----------------- parquet/src/util/bit_util.rs | 2 +- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index ddbd92bbda5e..c2beb4a534f0 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -84,7 +84,7 @@ impl RleEncoder { /// Initialize the encoder from existing `buffer` pub fn new_from_buf(bit_width: u8, buffer: Vec) -> Self { - assert!(bit_width <= 64); + debug_assert!(bit_width <= 64); let bit_writer = BitWriter::new_from_buf(buffer); RleEncoder { bit_width, @@ -469,7 +469,7 @@ impl RleDecoder { where T: Default + Clone, { - assert!(buffer.len() >= max_values); + debug_assert!(buffer.len() >= max_values); let mut values_read = 0; while values_read < max_values { @@ -507,27 +507,29 @@ impl RleDecoder { self.bit_packed_left = 0; break; } - let out: &mut [T] = &mut buffer[values_read..values_read + num_values]; - let idx = &index_buf[..num_values]; - let mut out_chunks = out.chunks_exact_mut(8); - let idx_chunks = idx.chunks_exact(8); - for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) { - let dict_len = dict.len(); - assert!( - idx_chunk.iter().all(|&i| (i as usize) < dict_len), - "dictionary index out of bounds" - ); - for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { - // SAFETY: all indices checked above to be in bounds - b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); - } - } - for (b, i) in out_chunks - .into_remainder() - .iter_mut() - .zip(idx.chunks_exact(8).remainder().iter()) { - b.clone_from(&dict[*i as usize]); + let out = &mut buffer[values_read..values_read + num_values]; + let idx = &index_buf[..num_values]; + let mut out_chunks = out.chunks_exact_mut(8); + let idx_chunks = idx.chunks_exact(8); + for (out_chunk, idx_chunk) in out_chunks.by_ref().zip(idx_chunks) { + let dict_len = dict.len(); + assert!( + idx_chunk.iter().all(|&i| (i as usize) < dict_len), + "dictionary index out of bounds" + ); + for (b, i) in out_chunk.iter_mut().zip(idx_chunk.iter()) { + // SAFETY: all indices checked above to be in bounds + b.clone_from(unsafe { dict.get_unchecked(*i as usize) }); + } + } + for (b, i) in out_chunks + .into_remainder() + .iter_mut() + .zip(idx.chunks_exact(8).remainder().iter()) + { + b.clone_from(&dict[*i as usize]); + } } self.bit_packed_left -= num_values as u32; values_read += num_values; diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 4155c22bba79..2a0a4eb7d6e3 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -201,7 +201,7 @@ pub(crate) fn read_num_bytes(size: usize, src: &[u8]) -> T where T: FromBytes, { - assert!(size <= src.len()); + debug_assert!(size <= src.len()); let mut buffer = ::Buffer::default(); buffer.as_mut()[..size].copy_from_slice(&src[..size]); ::from_le_bytes(buffer)