@@ -41,7 +41,12 @@ use bytes::Bytes;
4141use crate :: errors:: { ParquetError , Result } ;
4242use crate :: util:: bit_util:: { self , BitReader , BitWriter , FromBitpacked } ;
4343
44- /// Maximum groups of 8 values per bit-packed run. Current value is 64.
44+ /// Number of values in one bit-packed group. The Parquet RLE/bit-packing hybrid
45+ /// format always bit-packs values in multiples of this count (see the format spec:
46+ /// "we always bit-pack a multiple of 8 values at a time").
47+ const BIT_PACK_GROUP_SIZE : usize = 8 ;
48+
49+ /// Maximum groups of `BIT_PACK_GROUP_SIZE` values per bit-packed run. Current value is 64.
4550const MAX_GROUPS_PER_BIT_PACKED_RUN : usize = 1 << 6 ;
4651
4752/// A RLE/Bit-Packing hybrid encoder.
@@ -54,9 +59,9 @@ pub struct RleEncoder {
5459 bit_writer : BitWriter ,
5560
5661 // Buffered values for bit-packed runs.
57- buffered_values : [ u64 ; 8 ] ,
62+ buffered_values : [ u64 ; BIT_PACK_GROUP_SIZE ] ,
5863
59- // Number of current buffered values. Must be less than 8 .
64+ // Number of current buffered values. Must be less than BIT_PACK_GROUP_SIZE .
6065 num_buffered_values : usize ,
6166
6267 // The current (also last) value that was written and the count of how many
@@ -89,7 +94,7 @@ impl RleEncoder {
8994 RleEncoder {
9095 bit_width,
9196 bit_writer,
92- buffered_values : [ 0 ; 8 ] ,
97+ buffered_values : [ 0 ; BIT_PACK_GROUP_SIZE ] ,
9398 num_buffered_values : 0 ,
9499 current_value : 0 ,
95100 repeat_count : 0 ,
@@ -101,10 +106,10 @@ impl RleEncoder {
101106 /// Returns the maximum buffer size to encode `num_values` values with
102107 /// `bit_width`.
103108 pub fn max_buffer_size ( bit_width : u8 , num_values : usize ) -> usize {
104- // The maximum size occurs with the shortest possible runs of 8
105- let num_runs = bit_util:: ceil ( num_values, 8 ) ;
109+ // The maximum size occurs with the shortest possible runs of BIT_PACK_GROUP_SIZE
110+ let num_runs = bit_util:: ceil ( num_values, BIT_PACK_GROUP_SIZE ) ;
106111
107- // The number of bytes in a run of 8
112+ // The number of bytes in a run of BIT_PACK_GROUP_SIZE
108113 let bytes_per_run = bit_width as usize ;
109114
110115 // The maximum size if stored as shortest possible bit packed runs of BIT_PACK_GROUP_SIZE values
@@ -114,7 +119,8 @@ impl RleEncoder {
114119 let rle_len_prefix = 1 ;
115120
116121 // The length of an RLE run of BIT_PACK_GROUP_SIZE values
117- let min_rle_run_size = rle_len_prefix + bit_util:: ceil ( bit_width as usize , 8 ) ;
122+ let min_rle_run_size =
123+ rle_len_prefix + bit_util:: ceil ( bit_width as usize , u8:: BITS as usize ) ;
118124
119125 // The maximum size if stored as shortest possible RLE runs of BIT_PACK_GROUP_SIZE values
120126 let rle_max_size = num_runs * min_rle_run_size;
@@ -123,7 +129,7 @@ impl RleEncoder {
123129 }
124130
125131 /// Returns `true` if the encoder is currently in RLE accumulation mode
126- /// for the given value (i.e., `repeat_count >= 8 ` and `current_value == value`).
132+ /// for the given value (i.e., `repeat_count >= BIT_PACK_GROUP_SIZE ` and `current_value == value`).
127133 ///
128134 /// The encoder enters accumulation mode as soon as `BIT_PACK_GROUP_SIZE` consecutive
129135 /// identical values have been seen: at that point `flush_buffered_values` has committed the
@@ -132,7 +138,7 @@ impl RleEncoder {
132138 /// repetitions in O(1) once this returns `true`.
133139 #[ inline]
134140 pub fn is_accumulating ( & self , value : u64 ) -> bool {
135- self . repeat_count >= 8 && self . current_value == value
141+ self . repeat_count >= BIT_PACK_GROUP_SIZE && self . current_value == value
136142 }
137143
138144 /// Extends the current RLE run by `count` additional repetitions.
@@ -142,23 +148,24 @@ impl RleEncoder {
142148 /// returns `true` for the same value before calling this method.
143149 #[ inline]
144150 pub fn extend_run ( & mut self , count : usize ) {
145- debug_assert ! ( self . repeat_count >= 8 ) ;
151+ debug_assert ! ( self . repeat_count >= BIT_PACK_GROUP_SIZE ) ;
146152 self . repeat_count += count;
147153 }
148154
149155 /// Encodes `value`, which must be representable with `bit_width` bits.
150156 #[ inline]
151157 pub fn put ( & mut self , value : u64 ) {
152- // This function buffers 8 values at a time. After seeing 8 values, it
153- // decides whether the current run should be encoded in bit-packed or RLE.
158+ // This function buffers BIT_PACK_GROUP_SIZE values at a time. After seeing that
159+ // many values, it decides whether the current run should be encoded in bit-packed
160+ // or RLE.
154161 if self . current_value == value {
155162 self . repeat_count += 1 ;
156- if self . repeat_count > 8 {
163+ if self . repeat_count > BIT_PACK_GROUP_SIZE {
157164 // A continuation of last value. No need to buffer.
158165 return ;
159166 }
160167 } else {
161- if self . repeat_count >= 8 {
168+ if self . repeat_count >= BIT_PACK_GROUP_SIZE {
162169 // The current RLE run has ended and we've gathered enough. Flush first.
163170 debug_assert_eq ! ( self . bit_packed_count, 0 ) ;
164171 self . flush_rle_run ( ) ;
@@ -169,9 +176,9 @@ impl RleEncoder {
169176
170177 self . buffered_values [ self . num_buffered_values ] = value;
171178 self . num_buffered_values += 1 ;
172- if self . num_buffered_values == 8 {
179+ if self . num_buffered_values == BIT_PACK_GROUP_SIZE {
173180 // Buffered values are full. Flush them.
174- debug_assert_eq ! ( self . bit_packed_count % 8 , 0 ) ;
181+ debug_assert_eq ! ( self . bit_packed_count % BIT_PACK_GROUP_SIZE , 0 ) ;
175182 self . flush_buffered_values ( ) ;
176183 }
177184 }
@@ -243,9 +250,9 @@ impl RleEncoder {
243250 if self . repeat_count > 0 && all_repeat {
244251 self . flush_rle_run ( ) ;
245252 } else {
246- // Buffer the last group of bit-packed values to 8 by padding with 0s.
253+ // Buffer the last group of bit-packed values to BIT_PACK_GROUP_SIZE by padding with 0s.
247254 if self . num_buffered_values > 0 {
248- while self . num_buffered_values < 8 {
255+ while self . num_buffered_values < BIT_PACK_GROUP_SIZE {
249256 self . buffered_values [ self . num_buffered_values ] = 0 ;
250257 self . num_buffered_values += 1 ;
251258 }
@@ -263,7 +270,7 @@ impl RleEncoder {
263270 self . bit_writer . put_vlq_int ( indicator_value as u64 ) ;
264271 self . bit_writer . put_aligned (
265272 self . current_value ,
266- bit_util:: ceil ( self . bit_width as usize , 8 ) ,
273+ bit_util:: ceil ( self . bit_width as usize , u8 :: BITS as usize ) ,
267274 ) ;
268275 self . num_buffered_values = 0 ;
269276 self . repeat_count = 0 ;
@@ -281,7 +288,7 @@ impl RleEncoder {
281288 self . num_buffered_values = 0 ;
282289 if update_indicator_byte {
283290 // Write the indicator byte to the reserved position in `bit_writer`
284- let num_groups = self . bit_packed_count / 8 ;
291+ let num_groups = self . bit_packed_count / BIT_PACK_GROUP_SIZE ;
285292 let indicator_byte = ( ( num_groups << 1 ) | 1 ) as u8 ;
286293 self . bit_writer
287294 . put_aligned_offset ( indicator_byte, 1 , self . indicator_byte_pos as usize ) ;
@@ -291,19 +298,19 @@ impl RleEncoder {
291298 }
292299
293300 fn flush_buffered_values ( & mut self ) {
294- if self . repeat_count >= 8 {
301+ if self . repeat_count >= BIT_PACK_GROUP_SIZE {
295302 self . num_buffered_values = 0 ;
296303 if self . bit_packed_count > 0 {
297304 // In this case we choose RLE encoding. Flush the current buffered values
298305 // as bit-packed encoding.
299- debug_assert_eq ! ( self . bit_packed_count % 8 , 0 ) ;
306+ debug_assert_eq ! ( self . bit_packed_count % BIT_PACK_GROUP_SIZE , 0 ) ;
300307 self . flush_bit_packed_run ( true )
301308 }
302309 return ;
303310 }
304311
305312 self . bit_packed_count += self . num_buffered_values ;
306- let num_groups = self . bit_packed_count / 8 ;
313+ let num_groups = self . bit_packed_count / BIT_PACK_GROUP_SIZE ;
307314 if num_groups + 1 >= MAX_GROUPS_PER_BIT_PACKED_RUN {
308315 // We've reached the maximum value that can be held in a single bit-packed
309316 // run.
@@ -584,10 +591,10 @@ impl RleDecoder {
584591 return Ok ( false ) ;
585592 }
586593 if indicator_value & 1 == 1 {
587- self . bit_packed_left = ( ( indicator_value >> 1 ) * 8 ) as u32 ;
594+ self . bit_packed_left = ( ( indicator_value >> 1 ) * BIT_PACK_GROUP_SIZE as i64 ) as u32 ;
588595 } else {
589596 self . rle_left = ( indicator_value >> 1 ) as u32 ;
590- let value_width = bit_util:: ceil ( self . bit_width as usize , 8 ) ;
597+ let value_width = bit_util:: ceil ( self . bit_width as usize , u8 :: BITS as usize ) ;
591598 self . current_value = bit_reader. get_aligned :: < u64 > ( value_width) ;
592599 self . current_value . ok_or_else ( || {
593600 general_err ! ( "parquet_data_error: not enough data for RLE decoding" )
@@ -869,7 +876,7 @@ mod tests {
869876 & values[ ..] ,
870877 width as u8 ,
871878 None ,
872- 2 * ( 1 + bit_util:: ceil ( width as i64 , 8 ) as i32 ) ,
879+ 2 * ( 1 + bit_util:: ceil ( width as i64 , u8 :: BITS as i64 ) as i32 ) ,
873880 ) ;
874881 }
875882
@@ -879,9 +886,12 @@ mod tests {
879886 for i in 0 ..101 {
880887 values. push ( i % 2 ) ;
881888 }
882- let num_groups = bit_util:: ceil ( 100 , 8 ) as u8 ;
889+ let num_groups = bit_util:: ceil ( 100 , BIT_PACK_GROUP_SIZE ) as u8 ;
883890 expected_buffer. push ( ( num_groups << 1 ) | 1 ) ;
884- expected_buffer. resize ( expected_buffer. len ( ) + 100 / 8 , 0b10101010 ) ;
891+ expected_buffer. resize (
892+ expected_buffer. len ( ) + 100 / BIT_PACK_GROUP_SIZE ,
893+ 0b10101010 ,
894+ ) ;
885895
886896 // For the last 4 0 and 1's, padded with 0.
887897 expected_buffer. push ( 0b00001010 ) ;
@@ -892,12 +902,12 @@ mod tests {
892902 1 + num_groups as i32 ,
893903 ) ;
894904 for width in 2 ..MAX_WIDTH + 1 {
895- let num_values = bit_util:: ceil ( 100 , 8 ) * 8 ;
905+ let num_values = bit_util:: ceil ( 100 , BIT_PACK_GROUP_SIZE ) * BIT_PACK_GROUP_SIZE ;
896906 validate_rle (
897907 & values,
898908 width as u8 ,
899909 None ,
900- 1 + bit_util:: ceil ( width as i64 * num_values, 8 ) as i32 ,
910+ 1 + bit_util:: ceil ( width as i64 * num_values as i64 , u8 :: BITS as i64 ) as i32 ,
901911 ) ;
902912 }
903913 }
@@ -1058,7 +1068,7 @@ mod tests {
10581068 let num_values = 2002 ;
10591069
10601070 // bit-packed header
1061- let run_bytes = ceil ( num_values * bit_width, 8 ) as u64 ;
1071+ let run_bytes = ceil ( num_values * bit_width, u8 :: BITS as usize ) as u64 ;
10621072 writer. put_vlq_int ( ( run_bytes << 1 ) | 1 ) ;
10631073 for _ in 0 ..run_bytes {
10641074 writer. put_aligned ( 0xFF_u8 , 1 ) ;
0 commit comments