Skip to content

Commit 67e04e7

Browse files
feat: change default behavior for Parquet PageEncodingStats to bitmask (#9051)
# Which issue does this PR close? - Closes #8859 # Rationale for this change Currently the default behavior is to parse the full vector of encoding stats, but given the limited use of this information we should instead default to the more concise and performant bitmask. # What changes are included in this PR? - Implement `Default` for `ParquetMetaDataOptions` with `encoding_stats_as_mask: true` - Update Thrift decoding logic to default to bitmask even if options are missing - Update documentation to reflect the new default behavior - Update existing tests to maintain coverage for full statistics - Add new tests to verify default behavior and full stats option # Are these changes tested? Yes # Are there any user-facing changes? Yes
1 parent 9e822e0 commit 67e04e7

File tree

7 files changed

+138
-38
lines changed

7 files changed

+138
-38
lines changed

parquet/benches/metadata.rs

Lines changed: 11 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::sync::Arc;
2121
use parquet::basic::{Encoding, PageType, Type as PhysicalType};
2222
use parquet::file::metadata::{
2323
ColumnChunkMetaData, FileMetaData, PageEncodingStats, ParquetMetaData, ParquetMetaDataOptions,
24-
ParquetMetaDataReader, ParquetMetaDataWriter, ParquetStatisticsPolicy, RowGroupMetaData,
24+
ParquetMetaDataReader, ParquetMetaDataWriter, RowGroupMetaData,
2525
};
2626
use parquet::file::statistics::Statistics;
2727
use parquet::file::writer::TrackedWrite;
@@ -164,26 +164,17 @@ fn criterion_benchmark(c: &mut Criterion) {
164164
})
165165
});
166166

167-
let schema = ParquetMetaDataReader::decode_schema(&meta_data).unwrap();
168-
let options = ParquetMetaDataOptions::new().with_schema(schema);
169-
c.bench_function("decode metadata with schema", |b| {
170-
b.iter(|| {
171-
ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options))
172-
.unwrap();
173-
})
174-
});
175-
176-
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true);
177-
c.bench_function("decode metadata with stats mask", |b| {
167+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
168+
c.bench_function("decode metadata (full stats)", |b| {
178169
b.iter(|| {
179170
ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options))
180171
.unwrap();
181172
})
182173
});
183174

184-
let options =
185-
ParquetMetaDataOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
186-
c.bench_function("decode metadata with skip PES", |b| {
175+
let schema = ParquetMetaDataReader::decode_schema(&meta_data).unwrap();
176+
let options = ParquetMetaDataOptions::new().with_schema(schema);
177+
c.bench_function("decode metadata with schema", |b| {
187178
b.iter(|| {
188179
ParquetMetaDataReader::decode_metadata_with_options(&meta_data, Some(&options))
189180
.unwrap();
@@ -197,24 +188,16 @@ fn criterion_benchmark(c: &mut Criterion) {
197188
})
198189
});
199190

200-
let schema = ParquetMetaDataReader::decode_schema(&buf).unwrap();
201-
let options = ParquetMetaDataOptions::new().with_schema(schema);
202-
c.bench_function("decode metadata (wide) with schema", |b| {
191+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
192+
c.bench_function("decode metadata (wide) (full stats)", |b| {
203193
b.iter(|| {
204194
ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap();
205195
})
206196
});
207197

208-
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(true);
209-
c.bench_function("decode metadata (wide) with stats mask", |b| {
210-
b.iter(|| {
211-
ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap();
212-
})
213-
});
214-
215-
let options =
216-
ParquetMetaDataOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
217-
c.bench_function("decode metadata (wide) with skip PES", |b| {
198+
let schema = ParquetMetaDataReader::decode_schema(&buf).unwrap();
199+
let options = ParquetMetaDataOptions::new().with_schema(schema);
200+
c.bench_function("decode metadata (wide) with schema", |b| {
218201
b.iter(|| {
219202
ParquetMetaDataReader::decode_metadata_with_options(&buf, Some(&options)).unwrap();
220203
})

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4480,7 +4480,10 @@ mod tests {
44804480
.unwrap();
44814481

44824482
// check that the read metadata is also correct
4483-
let options = ReadOptionsBuilder::new().with_page_index().build();
4483+
let options = ReadOptionsBuilder::new()
4484+
.with_page_index()
4485+
.with_encoding_stats_as_mask(false)
4486+
.build();
44844487
let reader = SerializedFileReader::new_with_options(file, options).unwrap();
44854488

44864489
let rowgroup = reader.get_row_group(0).expect("row group missing");

parquet/src/arrow/mod.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,9 @@ pub fn parquet_column<'a>(
494494
#[cfg(test)]
495495
mod test {
496496
use crate::arrow::ArrowWriter;
497-
use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader, ParquetMetaDataWriter};
497+
use crate::file::metadata::{
498+
ParquetMetaData, ParquetMetaDataOptions, ParquetMetaDataReader, ParquetMetaDataWriter,
499+
};
498500
use crate::file::properties::{EnabledStatistics, WriterProperties};
499501
use crate::schema::parser::parse_message_type;
500502
use crate::schema::types::SchemaDescriptor;
@@ -511,13 +513,17 @@ mod test {
511513
let parquet_bytes = create_parquet_file();
512514

513515
// read the metadata from the file WITHOUT the page index structures
516+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
514517
let original_metadata = ParquetMetaDataReader::new()
518+
.with_metadata_options(Some(options))
515519
.parse_and_finish(&parquet_bytes)
516520
.unwrap();
517521

518522
// this should error because the page indexes are not present, but have offsets specified
519523
let metadata_bytes = metadata_to_bytes(&original_metadata);
524+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
520525
let err = ParquetMetaDataReader::new()
526+
.with_metadata_options(Some(options))
521527
.with_page_indexes(true) // there are no page indexes in the metadata
522528
.parse_and_finish(&metadata_bytes)
523529
.err()
@@ -533,7 +539,9 @@ mod test {
533539
let parquet_bytes = create_parquet_file();
534540

535541
// read the metadata from the file
542+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
536543
let original_metadata = ParquetMetaDataReader::new()
544+
.with_metadata_options(Some(options))
537545
.parse_and_finish(&parquet_bytes)
538546
.unwrap();
539547

@@ -545,7 +553,9 @@ mod test {
545553
"metadata is subset of parquet"
546554
);
547555

556+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
548557
let roundtrip_metadata = ParquetMetaDataReader::new()
558+
.with_metadata_options(Some(options))
549559
.parse_and_finish(&metadata_bytes)
550560
.unwrap();
551561

@@ -559,14 +569,18 @@ mod test {
559569

560570
// read the metadata from the file including the page index structures
561571
// (which are stored elsewhere in the footer)
572+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
562573
let original_metadata = ParquetMetaDataReader::new()
574+
.with_metadata_options(Some(options))
563575
.with_page_indexes(true)
564576
.parse_and_finish(&parquet_bytes)
565577
.unwrap();
566578

567579
// read metadata back from the serialized bytes and ensure it is the same
568580
let metadata_bytes = metadata_to_bytes(&original_metadata);
581+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
569582
let roundtrip_metadata = ParquetMetaDataReader::new()
583+
.with_metadata_options(Some(options))
570584
.with_page_indexes(true)
571585
.parse_and_finish(&metadata_bytes)
572586
.unwrap();

parquet/src/file/metadata/mod.rs

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,6 +1062,10 @@ impl ColumnChunkMetaData {
10621062

10631063
/// Returns the page encoding statistics, or `None` if no page encoding statistics
10641064
/// are available (or they were converted to a mask).
1065+
///
1066+
/// Note: By default, this crate converts page encoding statistics to a mask for performance
1067+
/// reasons. To get the full statistics, you must set [`ParquetMetaDataOptions::with_encoding_stats_as_mask`]
1068+
/// to `false`.
10651069
pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
10661070
match self.encoding_stats.as_ref() {
10671071
Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
@@ -1072,6 +1076,8 @@ impl ColumnChunkMetaData {
10721076
/// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
10731077
/// not available (or they were left in their original form).
10741078
///
1079+
/// Note: This is the default behavior for this crate.
1080+
///
10751081
/// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
10761082
/// enable fast determination of whether all pages in a column chunk are dictionary encoded
10771083
/// (see <https://github.com/apache/parquet-format/pull/16>).
@@ -1667,7 +1673,9 @@ impl OffsetIndexBuilder {
16671673
mod tests {
16681674
use super::*;
16691675
use crate::basic::{PageType, SortOrder};
1670-
use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};
1676+
use crate::file::metadata::thrift::tests::{
1677+
read_column_chunk, read_column_chunk_with_options, read_row_group,
1678+
};
16711679

16721680
#[test]
16731681
fn test_row_group_metadata_thrift_conversion() {
@@ -1822,7 +1830,72 @@ mod tests {
18221830
let mut buf = Vec::new();
18231831
let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
18241832
col_metadata.write_thrift(&mut writer).unwrap();
1825-
let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1833+
let col_chunk_res = read_column_chunk(&mut buf, column_descr.clone()).unwrap();
1834+
1835+
let expected_metadata = ColumnChunkMetaData::builder(column_descr)
1836+
.set_encodings_mask(EncodingMask::new_from_encodings(
1837+
[Encoding::PLAIN, Encoding::RLE].iter(),
1838+
))
1839+
.set_file_path("file_path".to_owned())
1840+
.set_num_values(1000)
1841+
.set_compression(Compression::SNAPPY)
1842+
.set_total_compressed_size(2000)
1843+
.set_total_uncompressed_size(3000)
1844+
.set_data_page_offset(4000)
1845+
.set_dictionary_page_offset(Some(5000))
1846+
.set_page_encoding_stats_mask(EncodingMask::new_from_encodings(
1847+
[Encoding::PLAIN, Encoding::RLE].iter(),
1848+
))
1849+
.set_bloom_filter_offset(Some(6000))
1850+
.set_bloom_filter_length(Some(25))
1851+
.set_offset_index_offset(Some(7000))
1852+
.set_offset_index_length(Some(25))
1853+
.set_column_index_offset(Some(8000))
1854+
.set_column_index_length(Some(25))
1855+
.set_unencoded_byte_array_data_bytes(Some(2000))
1856+
.set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1857+
.set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1858+
.build()
1859+
.unwrap();
1860+
1861+
assert_eq!(col_chunk_res, expected_metadata);
1862+
}
1863+
1864+
#[test]
1865+
fn test_column_chunk_metadata_thrift_conversion_full_stats() {
1866+
let column_descr = get_test_schema_descr().column(0);
1867+
let stats = vec![
1868+
PageEncodingStats {
1869+
page_type: PageType::DATA_PAGE,
1870+
encoding: Encoding::PLAIN,
1871+
count: 3,
1872+
},
1873+
PageEncodingStats {
1874+
page_type: PageType::DATA_PAGE,
1875+
encoding: Encoding::RLE,
1876+
count: 5,
1877+
},
1878+
];
1879+
let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1880+
.set_encodings_mask(EncodingMask::new_from_encodings(
1881+
[Encoding::PLAIN, Encoding::RLE].iter(),
1882+
))
1883+
.set_num_values(1000)
1884+
.set_compression(Compression::SNAPPY)
1885+
.set_total_compressed_size(2000)
1886+
.set_total_uncompressed_size(3000)
1887+
.set_data_page_offset(4000)
1888+
.set_page_encoding_stats(stats)
1889+
.build()
1890+
.unwrap();
1891+
1892+
let mut buf = Vec::new();
1893+
let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1894+
col_metadata.write_thrift(&mut writer).unwrap();
1895+
1896+
let options = ParquetMetaDataOptions::new().with_encoding_stats_as_mask(false);
1897+
let col_chunk_res =
1898+
read_column_chunk_with_options(&mut buf, column_descr, Some(&options)).unwrap();
18261899

18271900
assert_eq!(col_chunk_res, col_metadata);
18281901
}

parquet/src/file/metadata/options.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,23 @@ impl ParquetStatisticsPolicy {
8787
/// [`ParquetMetaData`]: crate::file::metadata::ParquetMetaData
8888
/// [`ParquetMetaDataReader`]: crate::file::metadata::ParquetMetaDataReader
8989
/// [`ParquetMetaDataPushDecoder`]: crate::file::metadata::ParquetMetaDataPushDecoder
90-
#[derive(Default, Debug, Clone)]
90+
#[derive(Debug, Clone)]
9191
pub struct ParquetMetaDataOptions {
9292
schema_descr: Option<SchemaDescPtr>,
9393
encoding_stats_as_mask: bool,
9494
encoding_stats_policy: ParquetStatisticsPolicy,
9595
}
9696

97+
impl Default for ParquetMetaDataOptions {
98+
fn default() -> Self {
99+
Self {
100+
schema_descr: None,
101+
encoding_stats_as_mask: true,
102+
encoding_stats_policy: ParquetStatisticsPolicy::KeepAll,
103+
}
104+
}
105+
}
106+
97107
impl ParquetMetaDataOptions {
98108
/// Return a new default [`ParquetMetaDataOptions`].
99109
pub fn new() -> Self {
@@ -118,7 +128,7 @@ impl ParquetMetaDataOptions {
118128
}
119129

120130
/// Returns whether to present the [`encoding_stats`] field of the Parquet `ColumnMetaData`
121-
/// as a bitmask (defaults to `false`).
131+
/// as a bitmask (defaults to `true`).
122132
///
123133
/// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
124134
/// might be desirable.
@@ -193,6 +203,12 @@ mod tests {
193203
};
194204
use std::{io::Read, sync::Arc};
195205

206+
#[test]
207+
fn test_options_default() {
208+
let options = ParquetMetaDataOptions::default();
209+
assert!(options.encoding_stats_as_mask());
210+
}
211+
196212
#[test]
197213
fn test_provide_schema() {
198214
let mut buf: Vec<u8> = Vec::new();

parquet/src/file/metadata/thrift/mod.rs

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ fn read_column_metadata<'a>(
410410
let mut seen_mask = 0u16;
411411

412412
let mut skip_pes = false;
413-
let mut pes_mask = false;
413+
let mut pes_mask = true;
414414

415415
if let Some(opts) = options {
416416
skip_pes = opts.skip_encoding_stats(col_index);
@@ -1704,7 +1704,7 @@ write_thrift_field!(RustBoundingBox, FieldType::Struct);
17041704
pub(crate) mod tests {
17051705
use crate::errors::Result;
17061706
use crate::file::metadata::thrift::{BoundingBox, SchemaElement, write_schema};
1707-
use crate::file::metadata::{ColumnChunkMetaData, RowGroupMetaData};
1707+
use crate::file::metadata::{ColumnChunkMetaData, ParquetMetaDataOptions, RowGroupMetaData};
17081708
use crate::parquet_thrift::tests::test_roundtrip;
17091709
use crate::parquet_thrift::{
17101710
ElementType, ThriftCompactOutputProtocol, ThriftSliceInputProtocol, read_thrift_vec,
@@ -1726,9 +1726,17 @@ pub(crate) mod tests {
17261726
pub(crate) fn read_column_chunk(
17271727
buf: &mut [u8],
17281728
column_descr: Arc<ColumnDescriptor>,
1729+
) -> Result<ColumnChunkMetaData> {
1730+
read_column_chunk_with_options(buf, column_descr, None)
1731+
}
1732+
1733+
pub(crate) fn read_column_chunk_with_options(
1734+
buf: &mut [u8],
1735+
column_descr: Arc<ColumnDescriptor>,
1736+
options: Option<&ParquetMetaDataOptions>,
17291737
) -> Result<ColumnChunkMetaData> {
17301738
let mut reader = ThriftSliceInputProtocol::new(buf);
1731-
crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, None)
1739+
crate::file::metadata::thrift::read_column_chunk(&mut reader, &column_descr, 0, options)
17321740
}
17331741

17341742
pub(crate) fn roundtrip_schema(schema: TypePtr) -> Result<TypePtr> {

parquet/src/file/serialized_reader.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1855,7 +1855,10 @@ mod tests {
18551855
fn test_file_reader_optional_metadata() {
18561856
// file with optional metadata: bloom filters, encoding stats, column index and offset index.
18571857
let file = get_test_file("data_index_bloom_encoding_stats.parquet");
1858-
let file_reader = Arc::new(SerializedFileReader::new(file).unwrap());
1858+
let options = ReadOptionsBuilder::new()
1859+
.with_encoding_stats_as_mask(false)
1860+
.build();
1861+
let file_reader = Arc::new(SerializedFileReader::new_with_options(file, options).unwrap());
18591862

18601863
let row_group_metadata = file_reader.metadata.row_group(0);
18611864
let col0_metadata = row_group_metadata.column(0);

0 commit comments

Comments (0)