Skip to content

Commit 4f6298f

Browse files
committed
make path_in_schema writing optional
1 parent aac969d commit 4f6298f

File tree

7 files changed

+93
-7
lines changed

7 files changed

+93
-7
lines changed

parquet/benches/metadata.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,8 @@ fn encoded_meta(is_nullable: bool, has_lists: bool) -> Vec<u8> {
143143
let mut buffer = Vec::with_capacity(1024);
144144
{
145145
let buf = TrackedWrite::new(&mut buffer);
146-
let writer = ParquetMetaDataWriter::new_with_tracked(buf, &metadata);
146+
let writer = ParquetMetaDataWriter::new_with_tracked(buf, &metadata)
147+
.with_write_path_in_schema(false);
147148
writer.finish().unwrap();
148149
}
149150

parquet/src/bin/parquet-rewrite.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,10 @@ struct Args {
279279
#[clap(long)]
280280
write_page_header_statistics: Option<bool>,
281281

282+
/// Write path_in_schema to the column metadata.
283+
#[clap(long)]
284+
write_path_in_schema: Option<bool>,
285+
282286
/// Sets whether bloom filter is enabled for all columns.
283287
#[clap(long)]
284288
bloom_filter_enabled: Option<bool>,
@@ -406,6 +410,9 @@ fn main() {
406410
if let Some(value) = args.coerce_types {
407411
writer_properties_builder = writer_properties_builder.set_coerce_types(value);
408412
}
413+
if let Some(value) = args.write_path_in_schema {
414+
writer_properties_builder = writer_properties_builder.set_write_path_in_schema(value);
415+
}
409416
if let Some(value) = args.write_batch_size {
410417
writer_properties_builder = writer_properties_builder.set_write_batch_size(value);
411418
}

parquet/src/file/metadata/thrift/mod.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1328,10 +1328,15 @@ pub(super) fn serialize_column_meta_data<W: Write>(
13281328
.encodings()
13291329
.collect::<Vec<_>>()
13301330
.write_thrift_field(w, 2, 1)?;
1331-
let path = column_chunk.column_descr.path().parts();
1332-
let path: Vec<&str> = path.iter().map(|v| v.as_str()).collect();
1333-
path.write_thrift_field(w, 3, 2)?;
1334-
column_chunk.compression.write_thrift_field(w, 4, 3)?;
1331+
if w.write_path_in_schema() {
1332+
let path = column_chunk.column_descr.path().parts();
1333+
let path: Vec<&str> = path.iter().map(|v| v.as_str()).collect();
1334+
path.write_thrift_field(w, 3, 2)?;
1335+
column_chunk.compression.write_thrift_field(w, 4, 3)?;
1336+
} else {
1337+
column_chunk.compression.write_thrift_field(w, 4, 2)?;
1338+
}
1339+
13351340
column_chunk.num_values.write_thrift_field(w, 5, 4)?;
13361341
column_chunk
13371342
.total_uncompressed_size
@@ -1401,6 +1406,8 @@ pub(super) fn serialize_column_meta_data<W: Write>(
14011406
pub(super) struct FileMeta<'a> {
14021407
pub(super) file_metadata: &'a crate::file::metadata::FileMetaData,
14031408
pub(super) row_groups: &'a Vec<RowGroupMetaData>,
1409+
// If true, then write the `path_in_schema` field in the ColumnMetaData struct.
1410+
pub(super) write_path_in_schema: bool,
14041411
}
14051412

14061413
// struct FileMetaData {
@@ -1420,6 +1427,8 @@ impl<'a> WriteThrift for FileMeta<'a> {
14201427
// needed for last_field_id w/o encryption
14211428
#[allow(unused_assignments)]
14221429
fn write_thrift<W: Write>(&self, writer: &mut ThriftCompactOutputProtocol<W>) -> Result<()> {
1430+
writer.set_write_path_in_schema(self.write_path_in_schema);
1431+
14231432
self.file_metadata
14241433
.version
14251434
.write_thrift_field(writer, 1, 0)?;

parquet/src/file/metadata/writer.rs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ pub(crate) struct ThriftMetadataWriter<'a, W: Write> {
6262
created_by: Option<String>,
6363
object_writer: MetadataObjectWriter,
6464
writer_version: i32,
65+
write_path_in_schema: bool,
6566
}
6667

6768
impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
@@ -259,6 +260,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
259260
let file_meta = FileMeta {
260261
file_metadata: &file_metadata,
261262
row_groups: &row_groups,
263+
write_path_in_schema: self.write_path_in_schema,
262264
};
263265

264266
// Write file metadata
@@ -293,6 +295,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
293295
row_groups: Vec<RowGroupMetaData>,
294296
created_by: Option<String>,
295297
writer_version: i32,
298+
write_path_in_schema: bool,
296299
) -> Self {
297300
Self {
298301
buf,
@@ -304,6 +307,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
304307
created_by,
305308
object_writer: Default::default(),
306309
writer_version,
310+
write_path_in_schema,
307311
}
308312
}
309313

@@ -415,6 +419,7 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
415419
pub struct ParquetMetaDataWriter<'a, W: Write> {
416420
buf: TrackedWrite<W>,
417421
metadata: &'a ParquetMetaData,
422+
write_path_in_schema: bool,
418423
}
419424

420425
impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
@@ -436,7 +441,20 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
436441
///
437442
/// See example on the struct level documentation
438443
pub fn new_with_tracked(buf: TrackedWrite<W>, metadata: &'a ParquetMetaData) -> Self {
439-
Self { buf, metadata }
444+
Self {
445+
buf,
446+
metadata,
447+
write_path_in_schema: true,
448+
}
449+
}
450+
451+
/// Set whether or not to write the `path_in_schema` field in the Thrift `ColumnMetaData`
452+
/// struct.
453+
pub fn with_write_path_in_schema(self, val: bool) -> Self {
454+
Self {
455+
write_path_in_schema: val,
456+
..self
457+
}
440458
}
441459

442460
/// Write the metadata to the buffer
@@ -460,6 +478,7 @@ impl<'a, W: Write> ParquetMetaDataWriter<'a, W> {
460478
row_groups,
461479
created_by,
462480
file_metadata.version(),
481+
self.write_path_in_schema,
463482
);
464483

465484
if let Some(column_indexes) = column_indexes {

parquet/src/file/properties.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
6767
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
6868
/// Default value for [`WriterProperties::coerce_types`]
6969
pub const DEFAULT_COERCE_TYPES: bool = false;
70+
/// Default value for [`WriterProperties::write_path_in_schema`]
71+
pub const DEFAULT_WRITE_PATH_IN_SCHEMA: bool = true;
7072
/// Default minimum chunk size for content-defined chunking: 256 KiB.
7173
pub const DEFAULT_CDC_MIN_CHUNK_SIZE: usize = 256 * 1024;
7274
/// Default maximum chunk size for content-defined chunking: 1024 KiB.
@@ -233,6 +235,7 @@ pub struct WriterProperties {
233235
statistics_truncate_length: Option<usize>,
234236
coerce_types: bool,
235237
content_defined_chunking: Option<CdcOptions>,
238+
write_path_in_schema: bool,
236239
#[cfg(feature = "encryption")]
237240
pub(crate) file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
238241
}
@@ -429,6 +432,14 @@ impl WriterProperties {
429432
self.coerce_types
430433
}
431434

435+
/// Returns `true` if the `path_in_schema` field of the `ColumnMetaData` Thrift struct
436+
/// should be written.
437+
///
438+
/// For more details see [`WriterPropertiesBuilder::set_write_path_in_schema`]
439+
pub fn write_path_in_schema(&self) -> bool {
440+
self.write_path_in_schema
441+
}
442+
432443
/// EXPERIMENTAL: Returns content-defined chunking options, or `None` if CDC is disabled.
433444
///
434445
/// For more details see [`WriterPropertiesBuilder::set_content_defined_chunking`]
@@ -560,6 +571,7 @@ pub struct WriterPropertiesBuilder {
560571
statistics_truncate_length: Option<usize>,
561572
coerce_types: bool,
562573
content_defined_chunking: Option<CdcOptions>,
574+
write_path_in_schema: bool,
563575
#[cfg(feature = "encryption")]
564576
file_encryption_properties: Option<Arc<FileEncryptionProperties>>,
565577
}
@@ -584,6 +596,7 @@ impl Default for WriterPropertiesBuilder {
584596
statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
585597
coerce_types: DEFAULT_COERCE_TYPES,
586598
content_defined_chunking: None,
599+
write_path_in_schema: DEFAULT_WRITE_PATH_IN_SCHEMA,
587600
#[cfg(feature = "encryption")]
588601
file_encryption_properties: None,
589602
}
@@ -622,6 +635,7 @@ impl WriterPropertiesBuilder {
622635
statistics_truncate_length: self.statistics_truncate_length,
623636
coerce_types: self.coerce_types,
624637
content_defined_chunking: self.content_defined_chunking,
638+
write_path_in_schema: self.write_path_in_schema,
625639
#[cfg(feature = "encryption")]
626640
file_encryption_properties: self.file_encryption_properties,
627641
}
@@ -837,6 +851,22 @@ impl WriterPropertiesBuilder {
837851
self
838852
}
839853

854+
/// Sets whether the writer should emit the `path_in_schema` element of the
855+
/// `ColumnMetaData` Thrift struct.
856+
///
857+
/// The `path_in_schema` field in the Thrift metadata is redundant and wastes a great
858+
/// deal of space. Parquet file footers can be made much smaller by omitting this field.
859+
/// Because the field was originally a mandatory field, this property defaults to `true`
860+
/// to maintain compatibility with older readers that expect this field to be present.
861+
/// If one knows that all readers one plans to use are tolerant of the absence of this field,
862+
/// this may be safely set to `false`.
863+
///
864+
/// At some point in the future this will default to `false`.
865+
pub fn set_write_path_in_schema(mut self, write_path_in_schema: bool) -> Self {
866+
self.write_path_in_schema = write_path_in_schema;
867+
self
868+
}
869+
840870
/// EXPERIMENTAL: Sets content-defined chunking options, or disables CDC with `None`.
841871
///
842872
/// When enabled, data page boundaries are determined by a rolling hash of the
@@ -1157,6 +1187,7 @@ impl From<WriterProperties> for WriterPropertiesBuilder {
11571187
statistics_truncate_length: props.statistics_truncate_length,
11581188
coerce_types: props.coerce_types,
11591189
content_defined_chunking: props.content_defined_chunking,
1190+
write_path_in_schema: props.write_path_in_schema,
11601191
#[cfg(feature = "encryption")]
11611192
file_encryption_properties: props.file_encryption_properties,
11621193
}

parquet/src/file/writer.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,12 +345,14 @@ impl<W: Write + Send> SerializedFileWriter<W> {
345345
let column_indexes = std::mem::take(&mut self.column_indexes);
346346
let offset_indexes = std::mem::take(&mut self.offset_indexes);
347347

348+
let write_path_in_schema = self.props.write_path_in_schema();
348349
let mut encoder = ThriftMetadataWriter::new(
349350
&mut self.buf,
350351
&self.descr,
351352
row_groups,
352353
Some(self.props.created_by().to_string()),
353354
self.props.writer_version().as_num(),
355+
write_path_in_schema,
354356
);
355357

356358
#[cfg(feature = "encryption")]

parquet/src/parquet_thrift.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,12 +708,29 @@ where
708708
/// [compact output]: https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md
709709
pub(crate) struct ThriftCompactOutputProtocol<W: Write> {
710710
writer: W,
711+
write_path_in_schema: bool,
711712
}
712713

713714
impl<W: Write> ThriftCompactOutputProtocol<W> {
714715
/// Create a new `ThriftCompactOutputProtocol` wrapping the byte sink `writer`.
715716
pub(crate) fn new(writer: W) -> Self {
716-
Self { writer }
717+
Self {
718+
writer,
719+
write_path_in_schema: true,
720+
}
721+
}
722+
723+
// TODO(ets): at some point there should probably be a properties object
724+
// to control aspects of thrift output. But since this is the only option to date
725+
// I'm choosing a simpler API.
726+
/// Control the writing of the `path_in_schema` element of the `ColumnMetaData` Thrift struct.
727+
pub(crate) fn set_write_path_in_schema(&mut self, val: bool) {
728+
self.write_path_in_schema = val;
729+
}
730+
731+
/// Indicate whether or not to emit `path_in_schema`.
732+
pub(crate) fn write_path_in_schema(&self) -> bool {
733+
self.write_path_in_schema
717734
}
718735

719736
/// Write a single byte to the output stream.

0 commit comments

Comments
 (0)