Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions parquet/src/file/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,11 @@ pub struct ColumnChunkMetaData {
column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
#[cfg(feature = "encryption")]
encrypted_column_metadata: Option<Vec<u8>>,
/// When true, indicates the footer is plaintext (not encrypted).
/// This affects how column metadata is serialized when `encrypted_column_metadata` is present.
/// This field is only used at write time and is not needed when reading metadata.
#[cfg(feature = "encryption")]
plaintext_footer_mode: bool,
}

/// Histograms for repetition and definition levels.
Expand Down Expand Up @@ -1244,6 +1249,8 @@ impl ColumnChunkMetaDataBuilder {
column_crypto_metadata: None,
#[cfg(feature = "encryption")]
encrypted_column_metadata: None,
#[cfg(feature = "encryption")]
plaintext_footer_mode: false,
})
}

Expand Down
12 changes: 10 additions & 2 deletions parquet/src/file/metadata/thrift/encryption.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,18 @@ fn row_group_from_encrypted_thrift(
}
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(crypto_metadata)) => {
let column_name = crypto_metadata.path_in_schema.join(".");
decryptor.get_column_metadata_decryptor(
// Try to get the decryptor - if it fails, we don't have the key
match decryptor.get_column_metadata_decryptor(
column_name.as_str(),
crypto_metadata.key_metadata.as_deref(),
)?
) {
Ok(dec) => dec,
Err(_) => {
// We don't have the key for this column, so we can't decrypt its metadata.
columns.push(c);
continue;
}
}
}
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) => {
decryptor.get_footer_decryptor()?
Expand Down
116 changes: 66 additions & 50 deletions parquet/src/file/metadata/thrift/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1285,6 +1285,19 @@ impl PageHeader {
/////////////////////////////////////////////////
// helper functions for writing file meta data

/// Reports whether statistics may be included when serializing the plaintext
/// column metadata for `column_chunk`.
///
/// With the `encryption` feature enabled this is `true` only while the chunk
/// has no `encrypted_column_metadata`.
#[cfg(feature = "encryption")]
fn should_write_column_stats(column_chunk: &ColumnChunkMetaData) -> bool {
    // Encrypted column metadata being present means the column is encrypted
    // with a different key to the footer, or a plaintext footer is used.
    // Either way the statistics are sensitive and must not be written.
    let has_encrypted_metadata = column_chunk.encrypted_column_metadata.is_some();
    !has_encrypted_metadata
}

/// Reports whether statistics may be included when serializing the column
/// metadata for `_column_chunk`.
///
/// Without the `encryption` feature nothing is withheld, so this is always `true`.
#[cfg(not(feature = "encryption"))]
fn should_write_column_stats(_column_chunk: &ColumnChunkMetaData) -> bool {
    true
}

// serialize the bits of the column chunk needed for a thrift ColumnMetaData
// struct ColumnMetaData {
// 1: required Type type
Expand Down Expand Up @@ -1335,48 +1348,51 @@ pub(super) fn serialize_column_meta_data<W: Write>(
if let Some(dictionary_page_offset) = column_chunk.dictionary_page_offset {
last_field_id = dictionary_page_offset.write_thrift_field(w, 11, last_field_id)?;
}
// PageStatistics is the same as thrift Statistics, but writable
let stats = page_stats_to_thrift(column_chunk.statistics());
if let Some(stats) = stats {
last_field_id = stats.write_thrift_field(w, 12, last_field_id)?;
}
if let Some(page_encoding_stats) = column_chunk.page_encoding_stats() {
last_field_id = page_encoding_stats.write_thrift_field(w, 13, last_field_id)?;
}
if let Some(bloom_filter_offset) = column_chunk.bloom_filter_offset {
last_field_id = bloom_filter_offset.write_thrift_field(w, 14, last_field_id)?;
}
if let Some(bloom_filter_length) = column_chunk.bloom_filter_length {
last_field_id = bloom_filter_length.write_thrift_field(w, 15, last_field_id)?;
}

// SizeStatistics
let size_stats = if column_chunk.unencoded_byte_array_data_bytes.is_some()
|| column_chunk.repetition_level_histogram.is_some()
|| column_chunk.definition_level_histogram.is_some()
{
let repetition_level_histogram = column_chunk
.repetition_level_histogram()
.map(|hist| hist.clone().into_inner());

let definition_level_histogram = column_chunk
.definition_level_histogram()
.map(|hist| hist.clone().into_inner());

Some(SizeStatistics {
unencoded_byte_array_data_bytes: column_chunk.unencoded_byte_array_data_bytes,
repetition_level_histogram,
definition_level_histogram,
})
} else {
None
};
if let Some(size_stats) = size_stats {
last_field_id = size_stats.write_thrift_field(w, 16, last_field_id)?;
}
if should_write_column_stats(column_chunk) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the idea here to only write the required bits of the column meta_data so readers won't complain? If so, then should all of the fields below also be skipped?

Copy link
Copy Markdown
Contributor

@adamreeve adamreeve Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I agree all the below fields should be skipped too.

I checked what the C++ implementation does and it's actually a bit different between the plaintext footer case and when the footer is encrypted but the column is encrypted with a different key: https://github.com/apache/arrow/blob/cbd36b817fc77812f8df1a15bf24314de3b27f29/cpp/src/parquet/metadata.cc#L1748-L1755.

When there's a plaintext footer, only the statistics and encoding_stats are stripped out of the unencrypted metadata, similar to what we're proposing here. But when the footer is also encrypted, it's assumed that the reader can handle when the whole metadata field isn't set so this is completely skipped, which is what was done before.

It might make sense to match what C++ does and keep the previous behaviour of excluding the full ColumnChunk metadata field when the footer is encrypted, and check that the reader can handle this when it doesn't have the column key. That should have the benefit of not increasing the footer size too much. And then in the plaintext footer case, also exclude the other fields below.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, @etseidl!
I've moved the other sensitive metadata into the conditional and added tests (except for geospatial, since it requires #[cfg(feature = "geospatial")] and seems straightforward enough not to cover here?).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @rok, looks good now. I'll leave it to @adamreeve if we want to revert to excluding the meta_data entirely when the footer is encrypted.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per @adamreeve's suggestion, this now omits the plaintext metadata entirely when encrypted metadata is present and the footer is encrypted, and writes it with the stats stripped out when the footer is plaintext.

// PageStatistics is the same as thrift Statistics, but writable
let stats = page_stats_to_thrift(column_chunk.statistics());
if let Some(stats) = stats {
last_field_id = stats.write_thrift_field(w, 12, last_field_id)?;
}
if let Some(page_encoding_stats) = column_chunk.page_encoding_stats() {
last_field_id = page_encoding_stats.write_thrift_field(w, 13, last_field_id)?;
}
if let Some(bloom_filter_offset) = column_chunk.bloom_filter_offset {
last_field_id = bloom_filter_offset.write_thrift_field(w, 14, last_field_id)?;
}
if let Some(bloom_filter_length) = column_chunk.bloom_filter_length {
last_field_id = bloom_filter_length.write_thrift_field(w, 15, last_field_id)?;
}

if let Some(geo_stats) = column_chunk.geo_statistics() {
geo_stats.write_thrift_field(w, 17, last_field_id)?;
// SizeStatistics
let size_stats = if column_chunk.unencoded_byte_array_data_bytes.is_some()
|| column_chunk.repetition_level_histogram.is_some()
|| column_chunk.definition_level_histogram.is_some()
{
let repetition_level_histogram = column_chunk
.repetition_level_histogram()
.map(|hist| hist.clone().into_inner());

let definition_level_histogram = column_chunk
.definition_level_histogram()
.map(|hist| hist.clone().into_inner());

Some(SizeStatistics {
unencoded_byte_array_data_bytes: column_chunk.unencoded_byte_array_data_bytes,
repetition_level_histogram,
definition_level_histogram,
})
} else {
None
};
if let Some(size_stats) = size_stats {
last_field_id = size_stats.write_thrift_field(w, 16, last_field_id)?;
}

if let Some(geo_stats) = column_chunk.geo_statistics() {
geo_stats.write_thrift_field(w, 17, last_field_id)?;
}
}

w.write_struct_end()
Expand Down Expand Up @@ -1596,17 +1612,17 @@ impl WriteThrift for ColumnChunkMetaData {
.write_thrift_field(writer, 2, last_field_id)?;

#[cfg(feature = "encryption")]
{
// only write the ColumnMetaData if we haven't already encrypted it
if self.encrypted_column_metadata.is_none() {
writer.write_field_begin(FieldType::Struct, 3, last_field_id)?;
serialize_column_meta_data(self, writer)?;
last_field_id = 3;
}
}
let write_meta_data =
self.encrypted_column_metadata.is_none() || self.plaintext_footer_mode;
#[cfg(not(feature = "encryption"))]
{
// always write the ColumnMetaData
let write_meta_data = true;

// When the footer is encrypted and encrypted_column_metadata is present,
// skip writing the plaintext meta_data field to reduce footer size.
// When the footer is plaintext (plaintext_footer_mode=true), we still write
// meta_data for backward compatibility with readers that expect it, but with
// sensitive fields (statistics, bloom filter info, etc.) stripped out.
if write_meta_data {
writer.write_field_begin(FieldType::Struct, 3, last_field_id)?;
serialize_column_meta_data(self, writer)?;
last_field_id = 3;
Expand Down
54 changes: 34 additions & 20 deletions parquet/src/file/metadata/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -818,34 +818,48 @@ impl MetadataObjectWriter {
) -> Result<ColumnChunkMetaData> {
// Column crypto metadata should have already been set when the column was created.
// Here we apply the encryption by encrypting the column metadata if required.
match column_chunk.column_crypto_metadata.as_deref() {
None => {}
let encryptor = match column_chunk.column_crypto_metadata.as_deref() {
None => None,
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_FOOTER_KEY) => {
let is_footer_encrypted = file_encryptor.properties().encrypt_footer();

// When uniform encryption is used the footer is already encrypted,
// so the column chunk does not need additional encryption.
// The exception is plaintext footer mode, where the column metadata
// must be encrypted here.
if !is_footer_encrypted {
Some(file_encryptor.get_footer_encryptor()?)
} else {
None
}
}
Some(ColumnCryptoMetaData::ENCRYPTION_WITH_COLUMN_KEY(col_key)) => {
use crate::file::metadata::thrift::serialize_column_meta_data;

let column_path = col_key.path_in_schema.join(".");
let mut column_encryptor = file_encryptor.get_column_encryptor(&column_path)?;
let aad = create_module_aad(
file_encryptor.file_aad(),
ModuleType::ColumnMetaData,
row_group_index,
column_index,
None,
)?;
// create temp ColumnMetaData that we can encrypt
let mut buffer: Vec<u8> = vec![];
{
let mut prot = ThriftCompactOutputProtocol::new(&mut buffer);
serialize_column_meta_data(&column_chunk, &mut prot)?;
}
let ciphertext = column_encryptor.encrypt(&buffer, &aad)?;
Some(file_encryptor.get_column_encryptor(&column_path)?)
}
};

if let Some(mut encryptor) = encryptor {
use crate::file::metadata::thrift::serialize_column_meta_data;

column_chunk.encrypted_column_metadata = Some(ciphertext);
let aad = create_module_aad(
file_encryptor.file_aad(),
ModuleType::ColumnMetaData,
row_group_index,
column_index,
None,
)?;
// create temp ColumnMetaData that we can encrypt
let mut buffer: Vec<u8> = vec![];
{
let mut prot = ThriftCompactOutputProtocol::new(&mut buffer);
serialize_column_meta_data(&column_chunk, &mut prot)?;
}
let ciphertext = encryptor.encrypt(&buffer, &aad)?;
column_chunk.encrypted_column_metadata = Some(ciphertext);
// Track whether the footer is plaintext, which affects how we serialize
// the column metadata (we need to write stripped metadata for backward compatibility)
column_chunk.plaintext_footer_mode = !file_encryptor.properties().encrypt_footer();
}

Ok(column_chunk)
Expand Down
Loading
Loading