From 455bc8b01e1523cf47bad816ebb9e20c2a744e52 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 1 Apr 2026 14:42:59 +0200 Subject: [PATCH 1/8] fix(parquet): fix CDC panic on nested ListArrays with null entries (#9637) The CDC chunker's value_offset diverged from actual leaf array positions when null list entries had non-empty child offset ranges (valid per the Arrow columnar format spec). This caused slice_for_chunk to produce incorrect non_null_indices, leading to an out-of-bounds panic in write_mini_batch. Track non-null value counts (nni) separately from leaf slot counts in the chunker, and use them in slice_for_chunk to correctly index into non_null_indices regardless of gaps in the leaf array. --- parquet/src/arrow/arrow_writer/levels.rs | 196 +++++++++----------- parquet/src/column/chunker/cdc.rs | 219 ++++++++++++++++++++--- parquet/src/column/chunker/mod.rs | 6 +- 3 files changed, 287 insertions(+), 134 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index d1da24872c49..2ebe1319160f 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -805,37 +805,26 @@ impl ArrayLevels { /// Create a sliced view of this `ArrayLevels` for a CDC chunk. /// - /// Note: `def_levels`, `rep_levels`, and `non_null_indices` are copied (not zero-copy), - /// while `array` is sliced without copying. + /// The chunk's `value_offset`/`num_values` select the relevant slice of + /// `non_null_indices`. The array is sliced to the range covered by + /// those indices, and they are shifted to be relative to the slice. 
pub(crate) fn slice_for_chunk(&self, chunk: &CdcChunk) -> Self { - let level_offset = chunk.level_offset; - let num_levels = chunk.num_levels; - let value_offset = chunk.value_offset; - let num_values = chunk.num_values; - let def_levels = self - .def_levels - .as_ref() - .map(|levels| levels[level_offset..level_offset + num_levels].to_vec()); - let rep_levels = self - .rep_levels - .as_ref() - .map(|levels| levels[level_offset..level_offset + num_levels].to_vec()); - - // Filter non_null_indices to [value_offset, value_offset + num_values) - // and shift by -value_offset. Use binary search since the slice is sorted. - let value_end = value_offset + num_values; - let start = self - .non_null_indices - .partition_point(|&idx| idx < value_offset); - let end = self - .non_null_indices - .partition_point(|&idx| idx < value_end); - let non_null_indices: Vec = self.non_null_indices[start..end] - .iter() - .map(|&idx| idx - value_offset) - .collect(); + let def_levels = self.def_levels.as_ref().map(|levels| { + levels[chunk.level_offset..chunk.level_offset + chunk.num_levels].to_vec() + }); + let rep_levels = self.rep_levels.as_ref().map(|levels| { + levels[chunk.level_offset..chunk.level_offset + chunk.num_levels].to_vec() + }); - let array = self.array.slice(value_offset, num_values); + // Select the non-null indices for this chunk. + let nni = &self.non_null_indices[chunk.value_offset..chunk.value_offset + chunk.num_values]; + // Compute the array range spanned by the non-null indices + let start = nni.first().copied().unwrap_or(0); + let end = nni.last().map_or(0, |&i| i + 1); + // Shift indices to be relative to the sliced array. + let non_null_indices = nni.iter().map(|&idx| idx - start).collect(); + // Slice the array to the computed range. 
+ let array = self.array.slice(start, end - start); let logical_nulls = array.logical_nulls(); Self { @@ -2149,9 +2138,8 @@ mod tests { fn test_slice_for_chunk_flat() { // Case 1: required field (max_def_level=0, no def/rep levels stored). // Array has 6 values; all are non-null so non_null_indices covers every position. - // The chunk selects value_offset=2, num_values=3 → the sub-array [3, 4, 5]. - // Since there are no levels, num_levels=0 and level_offset are irrelevant. - // non_null_indices [0,1,2,3,4,5] filtered to [2,4) and shifted by -2 → [0,1,2]. + // value_offset=2, num_values=3 → non_null_indices[2..5] = [2,3,4]. + // Array is sliced (no def_levels → write_batch_internal uses values.len()). let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6])); let logical_nulls = array.logical_nulls(); let levels = ArrayLevels { @@ -2176,14 +2164,9 @@ mod tests { // Case 2: optional field (max_def_level=1, def levels present, no rep levels). // Array: [Some(1), None, Some(3), None, Some(5), Some(6)] - // def_levels: [1, 0, 1, 0, 1, 1] (1=non-null, 0=null) - // non_null_indices: [0, 2, 4, 5] (array positions of the four non-null values) - // - // The chunk selects level_offset=1, num_levels=3, value_offset=1, num_values=3: - // - def_levels[1..4] = [0, 1, 0] → null, non-null, null - // - sub-array slice(1, 3) = [None, Some(3), None] - // - non_null_indices filtered to [value_offset=1, value_end=4): only index 2 qualifies, - // shifted by -1 → [1] (position of Some(3) within the sliced sub-array) + // non_null_indices: [0, 2, 4, 5] + // value_offset=1, num_values=1 → non_null_indices[1..2] = [2]. + // Array is not sliced (def_levels present → num_levels from def_levels.len()). 
let array: ArrayRef = Arc::new(Int32Array::from(vec![ Some(1), None, @@ -2206,90 +2189,85 @@ mod tests { level_offset: 1, num_levels: 3, value_offset: 1, - num_values: 3, + num_values: 1, }); assert_eq!(sliced.def_levels, Some(vec![0, 1, 0])); assert!(sliced.rep_levels.is_none()); - assert_eq!(sliced.non_null_indices, vec![1]); - assert_eq!(sliced.array.len(), 3); + assert_eq!(sliced.non_null_indices, vec![0]); // [2] shifted by -2 (nni[0]) + assert_eq!(sliced.array.len(), 1); } #[test] - fn test_slice_for_chunk_nested() { - // [[1,2],[3],[4,5]]: def=[2,2,2,2,2], rep=[0,1,0,0,1] - // Slice levels 2..5 (def=[2,2,2], rep=[0,0,1]), values 2..5 - let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + fn test_slice_for_chunk_nested_with_nulls() { + // Regression test for https://github.com/apache/arrow-rs/issues/9637 + // + // Simulates a List where null list entries have non-zero child + // ranges (valid per Arrow spec: "a null value may correspond to a + // non-empty segment in the child array"). This creates gaps in the + // leaf array that don't correspond to any levels. 
+ // + // 5 rows with 2 null list entries owning non-empty child ranges: + // row 0: [1] → leaf[0] + // row 1: null list → owns leaf[1..3] (gap of 2) + // row 2: [2, null] → leaf[3], leaf[4]=null element + // row 3: null list → owns leaf[5..8] (gap of 3) + // row 4: [4, 5] → leaf[8], leaf[9] + // + // def_levels: [3, 0, 3, 2, 0, 3, 3] + // rep_levels: [0, 0, 0, 1, 0, 0, 1] + // non_null_indices: [0, 3, 8, 9] + // gaps in array: 0→3 (skip 1,2), 3→8 (skip 5,6,7) + let array: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), // 0: row 0 + None, // 1: gap (null list row 1) + None, // 2: gap (null list row 1) + Some(2), // 3: row 2 + None, // 4: row 2, null element + None, // 5: gap (null list row 3) + None, // 6: gap (null list row 3) + None, // 7: gap (null list row 3) + Some(4), // 8: row 4 + Some(5), // 9: row 4 + ])); let logical_nulls = array.logical_nulls(); let levels = ArrayLevels { - def_levels: Some(vec![2, 2, 2, 2, 2]), - rep_levels: Some(vec![0, 1, 0, 0, 1]), - non_null_indices: vec![0, 1, 2, 3, 4], - max_def_level: 2, + def_levels: Some(vec![3, 0, 3, 2, 0, 3, 3]), + rep_levels: Some(vec![0, 0, 0, 1, 0, 0, 1]), + non_null_indices: vec![0, 3, 8, 9], + max_def_level: 3, max_rep_level: 1, array, logical_nulls, }; - let sliced = levels.slice_for_chunk(&CdcChunk { + + // Chunk 0: rows 0-1, nni=[0] → array sliced to [0..1] + let chunk0 = levels.slice_for_chunk(&CdcChunk { + level_offset: 0, + num_levels: 2, + value_offset: 0, + num_values: 1, + }); + assert_eq!(chunk0.non_null_indices, vec![0]); + assert_eq!(chunk0.array.len(), 1); + + // Chunk 1: rows 2-3, nni=[3] → array sliced to [3..4] + let chunk1 = levels.slice_for_chunk(&CdcChunk { level_offset: 2, num_levels: 3, - value_offset: 2, - num_values: 3, + value_offset: 1, + num_values: 1, }); - assert_eq!(sliced.def_levels, Some(vec![2, 2, 2])); - assert_eq!(sliced.rep_levels, Some(vec![0, 0, 1])); - // [0,1,2,3,4] filtered to [2,5) → [2,3,4] → shifted -2 → [0,1,2] - assert_eq!(sliced.non_null_indices, 
vec![0, 1, 2]); - assert_eq!(sliced.array.len(), 3); - } + assert_eq!(chunk1.non_null_indices, vec![0]); + assert_eq!(chunk1.array.len(), 1); - #[test] - fn test_slice_for_chunk_non_null_indices_boundary() { - // [1, null, 3]: non_null_indices=[0, 2]; test inclusive lower / exclusive upper bounds - let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])); - let logical_nulls = array.logical_nulls(); - let levels = ArrayLevels { - def_levels: Some(vec![1, 0, 1]), - rep_levels: None, - non_null_indices: vec![0, 2], - max_def_level: 1, - max_rep_level: 0, - array, - logical_nulls, - }; - assert_eq!( - levels - .slice_for_chunk(&CdcChunk { - level_offset: 0, - num_levels: 1, - value_offset: 0, - num_values: 1 - }) - .non_null_indices, - vec![0] - ); - // idx 2 in range [1,3), shifted -1 → 1 - assert_eq!( - levels - .slice_for_chunk(&CdcChunk { - level_offset: 1, - num_levels: 2, - value_offset: 1, - num_values: 2 - }) - .non_null_indices, - vec![1] - ); - // idx 2 excluded from [1,2) - assert_eq!( - levels - .slice_for_chunk(&CdcChunk { - level_offset: 1, - num_levels: 1, - value_offset: 1, - num_values: 1 - }) - .non_null_indices, - Vec::::new() - ); + // Chunk 2: row 4, nni=[8, 9] → array sliced to [8..10] + let chunk2 = levels.slice_for_chunk(&CdcChunk { + level_offset: 5, + num_levels: 2, + value_offset: 2, + num_values: 2, + }); + assert_eq!(chunk2.non_null_indices, vec![0, 1]); + assert_eq!(chunk2.array.len(), 2); } } diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index f21f58780a6a..750735730874 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -289,27 +289,39 @@ impl ContentDefinedChunker { let mut chunks = Vec::new(); let mut prev_offset: usize = 0; let mut prev_value_offset: usize = 0; - // Total number of values seen; for non-nested data this equals num_levels. 
- let mut total_values: usize = num_levels; + let mut value_offset: usize = 0; if !has_rep_levels && !has_def_levels { // Fastest path: non-nested, non-null data. + // Every level corresponds to exactly one non-null value, so + // value_offset == level_offset and num_values == num_levels. + // + // Example: required Int32, array = [10, 20, 30] + // level: 0 1 2 + // value_offset: 0 1 2 for offset in 0..num_levels { roll_value(self, offset); if self.need_new_chunk() { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_offset, num_levels: offset - prev_offset, + value_offset: prev_offset, num_values: offset - prev_offset, }); prev_offset = offset; } } - // Set the previous value offset to add the last chunk. prev_value_offset = prev_offset; + value_offset = num_levels; } else if !has_rep_levels { - // Non-nested data with nulls. + // Non-nested data with nulls. value_offset only increments for + // non-null values (def == max_def), so it diverges from the + // level offset when nulls are present. + // + // Example: optional Int32, array = [1, null, 2, null, 3] + // def_levels: [1, 0, 1, 0, 1] + // level: 0 1 2 3 4 + // value_offset: 0 1 2 (only increments on def==1) let def_levels = def_levels.expect("def_levels required when max_def_level > 0"); #[allow(clippy::needless_range_loop)] for offset in 0..num_levels { @@ -318,23 +330,56 @@ impl ContentDefinedChunker { if def_level == self.max_def_level { roll_value(self, offset); } + // Check boundary before incrementing value_offset so that + // num_values reflects only entries in the completed chunk. 
if self.need_new_chunk() { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_offset, num_levels: offset - prev_offset, - num_values: offset - prev_offset, + value_offset: prev_value_offset, + num_values: value_offset - prev_value_offset, }); prev_offset = offset; + prev_value_offset = value_offset; + } + if def_level == self.max_def_level { + value_offset += 1; } } - // Set the previous value offset to add the last chunk. - prev_value_offset = prev_offset; } else { - // Nested data with nulls. + // Nested data with nulls. Two counters are needed: + // + // leaf_offset: index into the leaf values array for hashing, + // incremented for all leaf slots (def >= repeated_ancestor_def_level), + // including null elements. + // + // value_offset: index into non_null_indices for chunk boundaries, + // incremented only for non-null leaf values (def == max_def_level). + // + // These diverge when nullable elements exist inside lists. + // + // Example: List with repeated_ancestor_def_level=2, max_def=3 + // row 0: [1, null, 2] (3 leaf slots, 2 non-null) + // row 1: [3] (1 leaf slot, 1 non-null) + // + // leaf array: [1, null, 2, 3] + // def_levels: [3, 2, 3, 3] + // rep_levels: [0, 1, 1, 0] + // + // level def leaf_offset value_offset action + // ───── ─── ─────────── ──────────── ────────────────────────── + // 0 3 0 0 roll_value(0), value++, leaf++ + // 1 2 1 1 leaf++ only (null element) + // 2 3 2 1 roll_value(2), value++, leaf++ + // 3 3 3 2 roll_value(3), value++, leaf++ + // + // roll_value(2) correctly indexes leaf array position 2 (value "2"). + // Using value_offset=1 would index position 1 (the null slot). + // + // Using value_offset for roll_value would hash the wrong array slot. 
let def_levels = def_levels.expect("def_levels required for nested data"); let rep_levels = rep_levels.expect("rep_levels required for nested data"); - let mut value_offset: usize = 0; + let mut leaf_offset: usize = 0; for offset in 0..num_levels { let def_level = def_levels[offset]; @@ -343,43 +388,45 @@ impl ContentDefinedChunker { self.roll_level(def_level); self.roll_level(rep_level); if def_level == self.max_def_level { - roll_value(self, value_offset); + roll_value(self, leaf_offset); } + // Check boundary before incrementing value_offset so that + // num_values reflects only entries in the completed chunk. if rep_level == 0 && self.need_new_chunk() { - // If we are at a record boundary and need a new chunk, create one. let levels_to_write = offset - prev_offset; if levels_to_write > 0 { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_value_offset, num_levels: levels_to_write, + value_offset: prev_value_offset, num_values: value_offset - prev_value_offset, }); prev_offset = offset; prev_value_offset = value_offset; } } - if def_level >= self.repeated_ancestor_def_level { - // We only increment the value offset if we have a leaf value. + if def_level == self.max_def_level { value_offset += 1; } + if def_level >= self.repeated_ancestor_def_level { + leaf_offset += 1; + } } - total_values = value_offset; } // Add the last chunk if we have any levels left. 
if prev_offset < num_levels { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_value_offset, num_levels: num_levels - prev_offset, - num_values: total_values - prev_value_offset, + value_offset: prev_value_offset, + num_values: value_offset - prev_value_offset, }); } #[cfg(debug_assertions)] - self.validate_chunks(&chunks, num_levels, total_values); + self.validate_chunks(&chunks, num_levels, value_offset); chunks } @@ -626,8 +673,9 @@ mod tests { assert_eq!(chunks1.len(), chunks2.len()); for (a, b) in chunks1.iter().zip(chunks2.iter()) { assert_eq!(a.level_offset, b.level_offset); - assert_eq!(a.value_offset, b.value_offset); assert_eq!(a.num_levels, b.num_levels); + assert_eq!(a.value_offset, b.value_offset); + assert_eq!(a.num_values, b.num_values); } } @@ -663,9 +711,12 @@ mod arrow_tests { use std::borrow::Borrow; use std::sync::Arc; + use arrow::util::data_gen::create_random_batch; use arrow_array::cast::AsArray; use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, RecordBatch}; - use arrow_schema::{DataType, Field, Schema}; + use arrow_buffer::Buffer; + use arrow_data::ArrayData; + use arrow_schema::{DataType, Field, Fields, Schema}; use crate::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use crate::arrow::arrow_writer::ArrowWriter; @@ -2153,4 +2204,128 @@ mod arrow_tests { "all chunks after the first must be identical" ); } + + /// Helper to write a batch with CDC and read it back. 
+ fn cdc_roundtrip(batch: &RecordBatch) -> RecordBatch { + let props = WriterProperties::builder() + .set_content_defined_chunking(Some(CdcOptions::default())) + .build(); + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); + writer.write(batch).unwrap(); + writer.close().unwrap(); + + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes::Bytes::from(buffer)) + .unwrap() + .build() + .unwrap(); + reader.into_iter().next().unwrap().unwrap() + } + + /// Regression test for + /// + /// Writing nested list data with CDC enabled panicked with an out-of-bounds + /// slice access when null list entries had non-zero child ranges. + #[test] + fn test_cdc_list_roundtrip() { + let schema = Arc::new(Schema::new(vec![ + Field::new( + "_1", + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), + true, + ), + Field::new( + "_2", + DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))), + true, + ), + Field::new( + "_3", + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Utf8, true))), + true, + ), + ])); + let batch = create_random_batch(schema, 2, 0.25, 0.75).unwrap(); + assert_eq!(cdc_roundtrip(&batch), batch); + } + + /// Test CDC with deeply nested types: List>, List>> + #[test] + fn test_cdc_deeply_nested_roundtrip() { + let inner_field = Field::new_list_field(DataType::Int32, true); + let inner_type = DataType::List(Arc::new(inner_field)); + let outer_field = Field::new_list_field(inner_type.clone(), true); + let list_list_type = DataType::List(Arc::new(outer_field)); + + let struct_inner_field = Field::new_list_field(DataType::Int32, true); + let struct_inner_type = DataType::List(Arc::new(struct_inner_field)); + let struct_fields = Fields::from(vec![Field::new("a", struct_inner_type, true)]); + let struct_type = DataType::Struct(struct_fields); + let struct_list_field = Field::new_list_field(struct_type, true); + let list_struct_type = 
DataType::List(Arc::new(struct_list_field)); + + let schema = Arc::new(Schema::new(vec![ + Field::new("list_list", list_list_type, true), + Field::new("list_struct_list", list_struct_type, true), + ])); + let batch = create_random_batch(schema, 200, 0.25, 0.75).unwrap(); + assert_eq!(cdc_roundtrip(&batch), batch); + } + + /// Test CDC with list arrays that have non-empty null segments. + /// + /// Per the Arrow columnar format spec: "a null value may correspond to a + /// non-empty segment in the child array". This test constructs such arrays + /// manually and verifies the CDC writer handles them correctly. + #[test] + fn test_cdc_list_non_empty_null_segments() { + // Build List where null entries own non-zero child ranges: + // row 0: [1, 2] offsets[0..2] valid + // row 1: null offsets[2..5] null, but owns 3 child values + // row 2: [6, 7] offsets[5..7] valid + // row 3: null offsets[7..9] null, but owns 2 child values + // row 4: [10] offsets[9..10] valid + let values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + let offsets = Buffer::from_iter([0_i32, 2, 5, 7, 9, 10]); + let null_bitmap = Buffer::from([0b00010101]); // rows 0, 2, 4 valid + + let list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); + let list_data = unsafe { + ArrayData::new_unchecked( + list_type.clone(), + 5, + None, + Some(null_bitmap), + 0, + vec![offsets], + vec![values.to_data()], + ) + }; + let list_array = arrow_array::make_array(list_data); + + let schema = Arc::new(Schema::new(vec![Field::new("col", list_type, true)])); + let batch = RecordBatch::try_new(schema, vec![list_array]).unwrap(); + + let read = cdc_roundtrip(&batch); + let read_list = read.column(0).as_list::(); + assert_eq!(read_list.len(), 5); + assert!(read_list.is_valid(0)); + assert!(read_list.is_null(1)); + assert!(read_list.is_valid(2)); + assert!(read_list.is_null(3)); + assert!(read_list.is_valid(4)); + + let get_vals = |i: usize| -> Vec { + read_list + .value(i) + 
.as_primitive::() + .values() + .iter() + .copied() + .collect() + }; + assert_eq!(get_vals(0), vec![1, 2]); + assert_eq!(get_vals(2), vec![6, 7]); + assert_eq!(get_vals(4), vec![10]); + } } diff --git a/parquet/src/column/chunker/mod.rs b/parquet/src/column/chunker/mod.rs index c4caf18af66b..42631e026db4 100644 --- a/parquet/src/column/chunker/mod.rs +++ b/parquet/src/column/chunker/mod.rs @@ -31,10 +31,10 @@ pub(crate) use cdc::ContentDefinedChunker; pub(crate) struct CdcChunk { /// The start offset of this chunk inside the given levels. pub level_offset: usize, - /// The start offset of this chunk inside the given values array. - pub value_offset: usize, /// The number of levels in this chunk. pub num_levels: usize, - /// The number of values (Arrow array elements) in this chunk. + /// The start index into `non_null_indices` for this chunk. + pub value_offset: usize, + /// The number of `non_null_indices` entries in this chunk. pub num_values: usize, } From 4fb27f65647939e2c97f91564d5071758498eced Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 2 Apr 2026 16:51:13 +0200 Subject: [PATCH 2/8] refactor(parquet): reuse existing write_with_cdc_options in regression tests --- parquet/src/column/chunker/cdc.rs | 46 ++++++++++++++++--------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index 750735730874..bd03af2b471d 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -2205,23 +2205,6 @@ mod arrow_tests { ); } - /// Helper to write a batch with CDC and read it back. 
- fn cdc_roundtrip(batch: &RecordBatch) -> RecordBatch { - let props = WriterProperties::builder() - .set_content_defined_chunking(Some(CdcOptions::default())) - .build(); - let mut buffer = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); - writer.write(batch).unwrap(); - writer.close().unwrap(); - - let reader = ParquetRecordBatchReaderBuilder::try_new(bytes::Bytes::from(buffer)) - .unwrap() - .build() - .unwrap(); - reader.into_iter().next().unwrap().unwrap() - } - /// Regression test for /// /// Writing nested list data with CDC enabled panicked with an out-of-bounds @@ -2245,8 +2228,14 @@ mod arrow_tests { true, ), ])); - let batch = create_random_batch(schema, 2, 0.25, 0.75).unwrap(); - assert_eq!(cdc_roundtrip(&batch), batch); + let batch = create_random_batch(schema, 10_000, 0.25, 0.75).unwrap(); + write_with_cdc_options( + &[&batch], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + None, + true, + ); } /// Test CDC with deeply nested types: List>, List>> @@ -2268,8 +2257,14 @@ mod arrow_tests { Field::new("list_list", list_list_type, true), Field::new("list_struct_list", list_struct_type, true), ])); - let batch = create_random_batch(schema, 200, 0.25, 0.75).unwrap(); - assert_eq!(cdc_roundtrip(&batch), batch); + let batch = create_random_batch(schema, 10_000, 0.25, 0.75).unwrap(); + write_with_cdc_options( + &[&batch], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + None, + true, + ); } /// Test CDC with list arrays that have non-empty null segments. 
@@ -2306,7 +2301,14 @@ mod arrow_tests { let schema = Arc::new(Schema::new(vec![Field::new("col", list_type, true)])); let batch = RecordBatch::try_new(schema, vec![list_array]).unwrap(); - let read = cdc_roundtrip(&batch); + let buf = write_with_cdc_options( + &[&batch], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + None, + true, + ); + let read = concat_batches(&read_batches(&buf)); let read_list = read.column(0).as_list::(); assert_eq!(read_list.len(), 5); assert!(read_list.is_valid(0)); From 3533fd8cc309385c849706f283ee5fbe568d8489 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 1 Apr 2026 17:35:30 +0200 Subject: [PATCH 3/8] feat(parquet): add content-addressed page store with CDC deduplication --- parquet/Cargo.toml | 13 + parquet/examples/page_store.rs | 102 +++++ parquet/src/arrow/arrow_writer/mod.rs | 34 +- parquet/src/arrow/mod.rs | 2 + parquet/src/arrow/page_store/mod.rs | 539 +++++++++++++++++++++++++ parquet/src/arrow/page_store/reader.rs | 248 ++++++++++++ parquet/src/arrow/page_store/writer.rs | 511 +++++++++++++++++++++++ parquet/src/bin/parquet-page-store.rs | 278 +++++++++++++ 8 files changed, 1710 insertions(+), 17 deletions(-) create mode 100644 parquet/examples/page_store.rs create mode 100644 parquet/src/arrow/page_store/mod.rs create mode 100644 parquet/src/arrow/page_store/reader.rs create mode 100644 parquet/src/arrow/page_store/writer.rs create mode 100644 parquet/src/bin/parquet-page-store.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index efcd1fe2190b..5e4eeacee1f8 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -43,6 +43,7 @@ arrow-csv = { workspace = true, optional = true } arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } +arrow-cast = { workspace = true, optional = true, features = ["prettyprint"] } arrow-ipc = { workspace = true, optional = true } parquet-geospatial = { workspace = true, optional 
= true } parquet-variant = { workspace = true, optional = true } @@ -77,6 +78,7 @@ half = { version = "2.1", default-features = false, features = ["num-traits"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } simdutf8 = { workspace = true , optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } +blake3 = { version = "1", default-features = false, optional = true } [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } @@ -134,6 +136,8 @@ flate2-zlib-rs = ["flate2/zlib-rs"] variant_experimental = ["arrow", "parquet-variant", "parquet-variant-json", "parquet-variant-compute"] # Enable geospatial support geospatial = ["parquet-geospatial"] +# Enable page store (content-addressed page storage) +page_store = ["arrow", "dep:blake3", "dep:arrow-cast", "serde", "serde_json"] [[example]] @@ -151,6 +155,11 @@ name = "write_parquet" required-features = ["cli"] path = "./examples/write_parquet.rs" +[[example]] +name = "page_store" +required-features = ["page_store"] +path = "./examples/page_store.rs" + [[example]] name = "read_with_rowgroup" required-features = ["arrow", "async"] @@ -180,6 +189,10 @@ name = "variant_integration" required-features = ["arrow", "variant_experimental", "serde"] path = "./tests/variant_integration.rs" +[[bin]] +name = "parquet-page-store" +required-features = ["page_store", "cli"] + [[bin]] name = "parquet-read" required-features = ["cli"] diff --git a/parquet/examples/page_store.rs b/parquet/examples/page_store.rs new file mode 100644 index 000000000000..8b963329c997 --- /dev/null +++ b/parquet/examples/page_store.rs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Example demonstrating the Parquet Page Store. +//! +//! Writes Arrow RecordBatches to a content-addressed page store and reads them back. + +use std::sync::Arc; + +use arrow_array::{ArrayRef, Float64Array, Int32Array, RecordBatch, StringArray}; +use arrow_cast::pretty::pretty_format_batches; +use parquet::arrow::page_store::{PageStoreReader, PageStoreWriter}; +use parquet::file::properties::{CdcOptions, EnabledStatistics, WriterProperties}; +use tempfile::TempDir; + +fn main() -> parquet::errors::Result<()> { + let tempdir = TempDir::new().unwrap(); + let store_dir = tempdir.path().join("page_store"); + + // Create sample data + let batch = RecordBatch::try_from_iter(vec![ + ( + "id", + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) as ArrayRef, + ), + ( + "value", + Arc::new(Float64Array::from(vec![ + 1.0, 2.5, 3.7, 4.2, 5.9, 6.1, 7.3, 8.8, 9.0, 10.5, + ])) as ArrayRef, + ), + ( + "name", + Arc::new(StringArray::from(vec![ + "alice", "bob", "charlie", "diana", "eve", "frank", "grace", "heidi", "ivan", + "judy", + ])) as ArrayRef, + ), + ]) + .unwrap(); + + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .set_content_defined_chunking(Some(CdcOptions::default())) + .build(); + + let metadata_path = tempdir.path().join("table.parquet"); + + // Write to page store + println!("Page store dir: {}", store_dir.display()); + 
println!("Metadata file: {}", metadata_path.display()); + let mut writer = PageStoreWriter::try_new(&store_dir, batch.schema(), Some(props))?; + writer.write(&batch)?; + let metadata = writer.finish(&metadata_path)?; + + println!( + "Wrote {} row group(s), {} total rows", + metadata.num_row_groups(), + metadata.file_metadata().num_rows() + ); + + // List page files + let page_files: Vec<_> = std::fs::read_dir(&store_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .collect(); + println!("Page files in store: {}", page_files.len()); + + // Read back from page store + println!("\nReading from page store..."); + let reader = PageStoreReader::try_new(&metadata_path, &store_dir)?; + let batches = reader.read_batches()?; + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + println!("Read {} batch(es), {} total rows", batches.len(), total_rows); + + // Display + let formatted = pretty_format_batches(&batches).unwrap(); + println!("\n{formatted}"); + + // Verify round-trip + assert_eq!(batches.len(), 1); + assert_eq!(batches[0], batch); + println!("\nRound-trip verification: PASSED"); + + Ok(()) +} diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2ef71d5745a2..5bf226701671 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -605,9 +605,9 @@ impl ArrowWriterOptions { /// A single column chunk produced by [`ArrowColumnWriter`] #[derive(Default)] -struct ArrowColumnChunkData { - length: usize, - data: Vec, +pub(crate) struct ArrowColumnChunkData { + pub(crate) length: usize, + pub(crate) data: Vec, } impl Length for ArrowColumnChunkData { @@ -632,7 +632,7 @@ impl ChunkReader for ArrowColumnChunkData { } /// A [`Read`] for [`ArrowColumnChunkData`] -struct ArrowColumnChunkReader(Peekable>); +pub(crate) struct ArrowColumnChunkReader(Peekable>); impl Read for ArrowColumnChunkReader { fn read(&mut 
self, out: &mut [u8]) -> std::io::Result { @@ -658,7 +658,7 @@ impl Read for ArrowColumnChunkReader { /// /// This allows it to be owned by [`ArrowPageWriter`] whilst allowing access via /// [`ArrowRowGroupWriter`] on flush, without requiring self-referential borrows -type SharedColumnChunk = Arc>; +pub(crate) type SharedColumnChunk = Arc>; #[derive(Default)] struct ArrowPageWriter { @@ -752,8 +752,8 @@ pub fn compute_leaves(field: &Field, array: &ArrayRef) -> Result), Column(ColumnWriter<'static>), } @@ -989,14 +989,14 @@ impl ArrowColumnWriter { /// /// See the example on [`ArrowColumnWriter`] for how to encode columns in parallel #[derive(Debug)] -struct ArrowRowGroupWriter { - writers: Vec, +pub(crate) struct ArrowRowGroupWriter { + pub(crate) writers: Vec, schema: SchemaRef, - buffered_rows: usize, + pub(crate) buffered_rows: usize, } impl ArrowRowGroupWriter { - fn new(writers: Vec, arrow: &SchemaRef) -> Self { + pub(crate) fn new(writers: Vec, arrow: &SchemaRef) -> Self { Self { writers, schema: arrow.clone(), @@ -1004,7 +1004,7 @@ impl ArrowRowGroupWriter { } } - fn write(&mut self, batch: &RecordBatch) -> Result<()> { + pub(crate) fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.buffered_rows += batch.num_rows(); let mut writers = self.writers.iter_mut(); for (field, column) in self.schema.fields().iter().zip(batch.columns()) { @@ -1015,7 +1015,7 @@ impl ArrowRowGroupWriter { Ok(()) } - fn write_with_chunkers( + pub(crate) fn write_with_chunkers( &mut self, batch: &RecordBatch, chunkers: &mut [ContentDefinedChunker], @@ -1042,7 +1042,7 @@ impl ArrowRowGroupWriter { .sum() } - fn close(self) -> Result> { + pub(crate) fn close(self) -> Result> { self.writers .into_iter() .map(|writer| writer.close()) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 52152988166f..e24788e4bcd7 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -182,6 +182,8 @@ experimental!(mod array_reader); pub mod arrow_reader; pub mod 
arrow_writer; +#[cfg(feature = "page_store")] +pub mod page_store; mod buffer; mod decoder; diff --git a/parquet/src/arrow/page_store/mod.rs b/parquet/src/arrow/page_store/mod.rs new file mode 100644 index 000000000000..87aa408b7840 --- /dev/null +++ b/parquet/src/arrow/page_store/mod.rs @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Content-addressed page store for Parquet files. +//! +//! This module provides [`PageStoreWriter`] and [`PageStoreReader`] for writing +//! and reading Parquet data through a content-addressed page store. Each data +//! page is stored as a separate file named by its BLAKE3 hash, enabling +//! cross-file page-level deduplication when used with +//! [content-defined chunking](crate::file::properties::CdcOptions). + +mod reader; +mod writer; + +pub use reader::PageStoreReader; +pub use writer::PageStoreWriter; + +use serde::{Deserialize, Serialize}; + +/// A reference to a page stored in the content-addressed page store. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PageRef { + /// Row group index + pub row_group: usize, + /// Column index (leaf column) + pub column: usize, + /// Page index within this column chunk (0-based) + pub page_index: usize, + /// Byte offset within the virtual column chunk + pub offset: i64, + /// Compressed page size in bytes (thrift header + data) + pub size: i32, + /// BLAKE3 hash hex string (64 chars) + pub hash: String, + /// True for dictionary pages + pub is_dict: bool, +} + +/// Manifest stored in the metadata-only parquet file's key-value metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PageStoreManifest { + /// All page references across all row groups and columns + pub pages: Vec, +} + +/// The key used to store the page store manifest in parquet key-value metadata. +const MANIFEST_KEY: &str = "page_store.manifest"; + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::Path; + use std::sync::Arc; + + use arrow_array::{ + ArrayRef, BooleanArray, Float64Array, Int32Array, ListArray, RecordBatch, + StringArray, StructArray, + }; + use arrow_schema::Field; + + use super::*; + use crate::errors::Result; + use crate::file::metadata::{ + FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataWriter, + }; + use crate::file::properties::{EnabledStatistics, WriterProperties}; + use crate::arrow::ArrowSchemaConverter; + use crate::schema::types::SchemaDescriptor; + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + fn count_page_files(dir: &Path) -> usize { + fs::read_dir(dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .count() + } + + fn write_batches( + store_dir: &Path, + metadata_path: &Path, + batches: &[RecordBatch], + props: Option, + ) -> Result { + let schema = batches[0].schema(); + let mut writer = 
PageStoreWriter::try_new(store_dir, schema, props)?; + for batch in batches { + writer.write(batch)?; + } + writer.finish(metadata_path) + } + + fn sample_batch() -> RecordBatch { + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as ArrayRef), + ("value", Arc::new(Float64Array::from(vec![1.0, 2.5, 3.7, 4.2, 5.9])) as ArrayRef), + ("name", Arc::new(StringArray::from(vec!["alice", "bob", "charlie", "diana", "eve"])) as ArrayRef), + ]) + .unwrap() + } + + // ----------------------------------------------------------------------- + // Round-trip tests + // ----------------------------------------------------------------------- + + #[test] + fn test_round_trip() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let metadata = write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + assert_eq!(metadata.num_row_groups(), 1); + assert_eq!(metadata.file_metadata().num_rows(), 5); + assert!(count_page_files(&store) > 0); + + let reader = PageStoreReader::try_new(&meta, &store).unwrap(); + let batches = reader.read_batches().unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_multiple_batches_single_row_group() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let b1 = RecordBatch::try_from_iter(vec![ + ("x", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), + ]).unwrap(); + let b2 = RecordBatch::try_from_iter(vec![ + ("x", Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef), + ]).unwrap(); + + let metadata = write_batches(&store, &meta, &[b1, b2], None).unwrap(); + assert_eq!(metadata.num_row_groups(), 1); + assert_eq!(metadata.file_metadata().num_rows(), 5); + + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let total: usize = 
batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 5); + } + + #[test] + fn test_multiple_row_groups_via_flush() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let mut writer = PageStoreWriter::try_new(&store, batch.schema(), None).unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + assert_eq!(metadata.num_row_groups(), 3); + assert_eq!(metadata.file_metadata().num_rows(), 15); + + let total: usize = PageStoreReader::try_new(&meta, &store) + .unwrap().read_batches().unwrap() + .iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 15); + } + + #[test] + fn test_flush_empty_is_noop() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let mut writer = PageStoreWriter::try_new(&store, batch.schema(), None).unwrap(); + writer.flush().unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + assert_eq!(metadata.num_row_groups(), 1); + assert_eq!(metadata.file_metadata().num_rows(), 5); + } + + // ----------------------------------------------------------------------- + // Column type tests + // ----------------------------------------------------------------------- + + #[test] + fn test_nullable_columns() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)])) as ArrayRef), + ("label", Arc::new(StringArray::from(vec![Some("a"), Some("b"), None, None, Some("e")])) as ArrayRef), + ]).unwrap(); + + 
write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_boolean_column() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = RecordBatch::try_from_iter(vec![ + ("flag", Arc::new(BooleanArray::from(vec![true, false, true, true, false])) as ArrayRef), + ]).unwrap(); + + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_nested_struct_column() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let struct_array = StructArray::from(vec![ + (Arc::new(Field::new("a", arrow_schema::DataType::Int32, false)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), + (Arc::new(Field::new("b", arrow_schema::DataType::Utf8, false)), + Arc::new(StringArray::from(vec!["x", "y", "z"])) as ArrayRef), + ]); + let batch = RecordBatch::try_from_iter(vec![ + ("s", Arc::new(struct_array) as ArrayRef), + ]).unwrap(); + + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_list_column() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let values = Int32Array::from(vec![1, 2, 3, 4, 5, 6]); + let offsets = arrow_buffer::OffsetBuffer::new(vec![0, 2, 2, 5, 6].into()); + let list = ListArray::new( + Arc::new(Field::new_list_field(arrow_schema::DataType::Int32, false)), + offsets, + Arc::new(values), + None, + ); + let batch = RecordBatch::try_from_iter(vec![ + 
("items", Arc::new(list) as ArrayRef), + ]).unwrap(); + + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + // ----------------------------------------------------------------------- + // CDC / dedup tests + // ----------------------------------------------------------------------- + + #[test] + fn test_cdc_enabled_by_default() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + let total: usize = PageStoreReader::try_new(&meta, &store) + .unwrap().read_batches().unwrap() + .iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 5); + } + + #[test] + fn test_cdc_enabled_even_with_custom_props() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + write_batches(&store, &meta, &[batch.clone()], Some(props)).unwrap(); + + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_dedup_identical_row_groups() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(5)) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let mut writer = PageStoreWriter::try_new(&store, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + 
assert_eq!(metadata.num_row_groups(), 2); + + let reader = PageStoreReader::try_new(&meta, &store).unwrap(); + let manifest = reader.manifest(); + + let rg0: Vec<_> = manifest.pages.iter().filter(|p| p.row_group == 0).collect(); + let rg1: Vec<_> = manifest.pages.iter().filter(|p| p.row_group == 1).collect(); + assert_eq!(rg0.len(), rg1.len()); + for (p0, p1) in rg0.iter().zip(rg1.iter()) { + assert_eq!(p0.hash, p1.hash); + } + + let unique: std::collections::HashSet<_> = manifest.pages.iter().map(|p| &p.hash).collect(); + assert_eq!(count_page_files(&store), unique.len()); + + let total: usize = reader.read_batches().unwrap().iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 10); + } + + #[test] + fn test_cross_file_dedup() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta_a = tmp.path().join("table_a.parquet"); + let meta_b = tmp.path().join("table_b.parquet"); + + let batch = sample_batch(); + + write_batches(&store, &meta_a, &[batch.clone()], None).unwrap(); + let pages_after_first = count_page_files(&store); + + write_batches(&store, &meta_b, &[batch.clone()], None).unwrap(); + let pages_after_second = count_page_files(&store); + + assert_eq!(pages_after_first, pages_after_second); + + let batches_a = PageStoreReader::try_new(&meta_a, &store).unwrap().read_batches().unwrap(); + let batches_b = PageStoreReader::try_new(&meta_b, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches_a, batches_b); + assert_eq!(batches_a[0], batch); + } + + // ----------------------------------------------------------------------- + // Page integrity tests + // ----------------------------------------------------------------------- + + #[test] + fn test_metadata_path_outside_store() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("shared_pages"); + let meta = tmp.path().join("elsewhere").join("my_table.parquet"); + fs::create_dir_all(meta.parent().unwrap()).unwrap(); + + let batch = 
sample_batch(); + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_page_integrity() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + + for entry in fs::read_dir(&store).unwrap() { + let entry = entry.unwrap(); + let path = entry.path(); + if path.extension().map_or(false, |ext| ext == "page") { + let data = fs::read(&path).unwrap(); + let hash = blake3::hash(&data); + let expected = format!("{}.page", hash.to_hex()); + assert_eq!(path.file_name().unwrap().to_str().unwrap(), expected); + } + } + } + + #[test] + fn test_manifest_page_refs_consistent() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let metadata = write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + let reader = PageStoreReader::try_new(&meta, &store).unwrap(); + let manifest = reader.manifest(); + + for pr in &manifest.pages { + assert!(store.join(format!("{}.page", pr.hash)).exists()); + } + assert!(manifest.pages.iter().all(|p| p.row_group == 0)); + + let columns: std::collections::HashSet<_> = manifest.pages.iter().map(|p| p.column).collect(); + assert_eq!(columns.len(), metadata.row_groups()[0].num_columns()); + + for col in &columns { + let mut idxs: Vec<_> = manifest.pages.iter() + .filter(|p| p.column == *col) + .map(|p| p.page_index) + .collect(); + idxs.sort(); + assert_eq!(idxs, (0..idxs.len()).collect::>()); + } + } + + // ----------------------------------------------------------------------- + // Reader accessors + // ----------------------------------------------------------------------- + + #[test] + fn test_reader_schema() { + let tmp = tempfile::tempdir().unwrap(); + let store 
= tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + let schema = PageStoreReader::try_new(&meta, &store).unwrap().schema().unwrap(); + assert_eq!(schema.fields(), batch.schema().fields()); + } + + #[test] + fn test_reader_metadata() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + + let md = PageStoreReader::try_new(&meta, &store).unwrap(); + assert_eq!(md.metadata().num_row_groups(), 1); + assert_eq!(md.metadata().file_metadata().num_rows(), 5); + assert_eq!(md.metadata().row_groups()[0].num_columns(), 3); + } + + // ----------------------------------------------------------------------- + // Reader error cases + // ----------------------------------------------------------------------- + + #[test] + fn test_reader_missing_metadata_file() { + let tmp = tempfile::tempdir().unwrap(); + assert!(PageStoreReader::try_new(tmp.path().join("no.parquet"), tmp.path()).is_err()); + } + + #[test] + fn test_reader_missing_page_file() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + + let first_page = fs::read_dir(&store).unwrap() + .filter_map(|e| e.ok()) + .find(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .unwrap(); + fs::remove_file(first_page.path()).unwrap(); + + assert!(PageStoreReader::try_new(&meta, &store).unwrap().read_batches().is_err()); + } + + #[test] + fn test_reader_corrupt_manifest() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema_descr = 
Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); + let file_metadata = FileMetaData::new( + 2, 0, None, + Some(vec![KeyValue::new(MANIFEST_KEY.to_string(), "not json{{{".to_string())]), + schema_descr, None, + ); + fs::create_dir_all(&store).unwrap(); + let file = fs::File::create(&meta).unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + + let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); + assert!(err.contains("expected"), "unexpected error: {err}"); + } + + #[test] + fn test_reader_missing_manifest_key() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); + let file_metadata = FileMetaData::new(2, 0, None, None, schema_descr, None); + fs::create_dir_all(&store).unwrap(); + let file = fs::File::create(&meta).unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + + let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); + assert!(err.contains(MANIFEST_KEY), "error should mention key: {err}"); + } +} diff --git a/parquet/src/arrow/page_store/reader.rs b/parquet/src/arrow/page_store/reader.rs new file mode 100644 index 000000000000..9712bf0e2c03 --- /dev/null +++ b/parquet/src/arrow/page_store/reader.rs @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`PageStoreReader`] — reads Arrow data from a content-addressed page store. + +use std::collections::BTreeMap; +use std::fs; +use std::io::{self, Cursor}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use bytes::Bytes; + +use arrow_array::RecordBatch; +use arrow_schema::{ArrowError, SchemaRef}; + +use super::{PageStoreManifest, MANIFEST_KEY}; +use crate::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, +}; +use crate::errors::Result; +use crate::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; +use crate::file::reader::{ChunkReader, Length}; + +/// Reads Parquet data from a content-addressed page store. +/// +/// Takes a metadata-only Parquet file (written by [`super::PageStoreWriter`]) +/// and the `store_dir` that holds the `{hash}.page` blobs. The metadata file +/// can live anywhere — it does not need to be inside `store_dir`. +/// +/// Pages are read on-demand from the store directory — only the pages +/// needed for the requested row groups are loaded into memory. 
+/// +/// # Example +/// ```no_run +/// # use parquet::arrow::page_store::PageStoreReader; +/// let reader = PageStoreReader::try_new( +/// "/data/tables/my_table.parquet", +/// "/data/pages", +/// ).unwrap(); +/// let batches = reader.read_batches().unwrap(); +/// ``` +pub struct PageStoreReader { + store_dir: PathBuf, + metadata: Arc, + manifest: PageStoreManifest, +} + +impl std::fmt::Debug for PageStoreReader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PageStoreReader") + .field("store_dir", &self.store_dir) + .field("num_pages", &self.manifest.pages.len()) + .finish() + } +} + +impl PageStoreReader { + /// Open a page-store-backed Parquet file. + /// + /// * `metadata_path` — path to the metadata-only `.parquet` file. + /// * `store_dir` — directory containing `{hash}.page` blobs. + pub fn try_new( + metadata_path: impl AsRef, + store_dir: impl Into, + ) -> Result { + let store_dir = store_dir.into(); + let file = fs::File::open(metadata_path.as_ref())?; + + let metadata = ParquetMetaDataReader::new() + .with_page_index_policy(PageIndexPolicy::Required) + .parse_and_finish(&file)?; + + let manifest = Self::parse_manifest(&metadata)?; + + Ok(Self { + store_dir, + metadata: Arc::new(metadata), + manifest, + }) + } + + /// Returns a reference to the Parquet metadata. + pub fn metadata(&self) -> &ParquetMetaData { + &self.metadata + } + + /// Returns the manifest with all page references. + pub fn manifest(&self) -> &PageStoreManifest { + &self.manifest + } + + /// Returns the Arrow schema. + pub fn schema(&self) -> std::result::Result { + let parquet_schema = self.metadata.file_metadata().schema_descr(); + Ok(Arc::new(crate::arrow::parquet_to_arrow_schema( + parquet_schema, + self.metadata.file_metadata().key_value_metadata(), + )?)) + } + + /// Build a streaming [`ParquetRecordBatchReader`] over the page store. 
+ /// + /// Prefer this over [`Self::read_batches`] for large files — batches are + /// decoded on-demand and only one batch is held in memory at a time. + pub fn reader(&self) -> Result { + let chunk_reader = PageStoreChunkReader::new(self.store_dir.clone(), &self.manifest); + let options = + ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required); + let arrow_metadata = ArrowReaderMetadata::try_new(Arc::clone(&self.metadata), options)?; + ParquetRecordBatchReaderBuilder::new_with_metadata(chunk_reader, arrow_metadata).build() + } + + /// Read all data from the page store and return as [`RecordBatch`]es. + /// + /// Convenient for small datasets and tests. For large files use + /// [`Self::reader`] to stream batches one at a time. + pub fn read_batches(&self) -> Result> { + self.reader()? + .collect::, _>>() + .map_err(|e| crate::errors::ParquetError::General(e.to_string())) + } + + fn parse_manifest(metadata: &ParquetMetaData) -> Result { + let kv = metadata + .file_metadata() + .key_value_metadata() + .and_then(|kvs| kvs.iter().find(|kv| kv.key == MANIFEST_KEY)) + .ok_or_else(|| { + crate::errors::ParquetError::General(format!( + "Missing '{MANIFEST_KEY}' in parquet key-value metadata" + )) + })?; + + let value = kv.value.as_ref().ok_or_else(|| { + crate::errors::ParquetError::General(format!("'{MANIFEST_KEY}' has no value")) + })?; + + serde_json::from_str(value) + .map_err(|e| crate::errors::ParquetError::General(e.to_string())) + } +} + +// --------------------------------------------------------------------------- +// PageStoreChunkReader — on-demand ChunkReader backed by page blobs +// --------------------------------------------------------------------------- + +/// A [`ChunkReader`] that serves byte ranges from page store blobs. 
+/// +/// Builds a sorted interval map from the metadata offsets to page file hashes, +/// so that any byte-range request from the Parquet decoder is resolved by +/// reading only the appropriate `.page` file(s) from disk. +pub struct PageStoreChunkReader { + store_dir: PathBuf, + /// Sorted map: virtual file offset -> (size, hash). + pages: BTreeMap, + /// Virtual file length (max offset + size across all pages). + total_len: u64, +} + +impl PageStoreChunkReader { + fn new(store_dir: PathBuf, manifest: &PageStoreManifest) -> Self { + let mut pages = BTreeMap::new(); + let mut total_len: u64 = 0; + for pr in &manifest.pages { + pages.insert(pr.offset, (pr.size, pr.hash.clone())); + let end = pr.offset as u64 + pr.size as u64; + if end > total_len { + total_len = end; + } + } + Self { + store_dir, + pages, + total_len, + } + } + + fn read_page_file(&self, hash: &str) -> io::Result { + let path = self.store_dir.join(format!("{hash}.page")); + let data = fs::read(&path)?; + Ok(Bytes::from(data)) + } +} + +impl Length for PageStoreChunkReader { + fn len(&self) -> u64 { + self.total_len + } +} + +impl ChunkReader for PageStoreChunkReader { + type T = Cursor; + + fn get_read(&self, start: u64) -> Result { + let bytes = self.get_bytes(start, (self.total_len - start) as usize)?; + Ok(Cursor::new(bytes)) + } + + fn get_bytes(&self, start: u64, length: usize) -> Result { + let end = start as i64 + length as i64; + let mut result = Vec::with_capacity(length); + + let scan_start = self + .pages + .range(..=start as i64) + .next_back() + .map(|(&o, _)| o) + .unwrap_or(0); + + for (&offset, (size, hash)) in self.pages.range(scan_start..) 
{ + if offset >= end { + break; + } + + let page_data = self.read_page_file(hash)?; + + let copy_start = (start as i64 - offset).max(0) as usize; + let copy_end = (end - offset).min(*size as i64) as usize; + + if copy_start < copy_end && copy_start < page_data.len() { + let actual_end = copy_end.min(page_data.len()); + result.extend_from_slice(&page_data[copy_start..actual_end]); + } + + if result.len() >= length { + break; + } + } + + result.truncate(length); + Ok(Bytes::from(result)) + } +} diff --git a/parquet/src/arrow/page_store/writer.rs b/parquet/src/arrow/page_store/writer.rs new file mode 100644 index 000000000000..0c5377d3741c --- /dev/null +++ b/parquet/src/arrow/page_store/writer.rs @@ -0,0 +1,511 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`PageStoreWriter`] — writes Arrow data to a content-addressed page store. 
+ +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; + +use bytes::Bytes; + +use arrow_array::RecordBatch; +use arrow_schema::{DataType as ArrowDataType, SchemaRef}; + +use super::{PageRef, PageStoreManifest, MANIFEST_KEY}; +use crate::arrow::arrow_writer::{ + ArrowColumnChunk, ArrowColumnChunkData, ArrowColumnWriterImpl, ArrowRowGroupWriter, + SharedColumnChunk, +}; +use crate::arrow::ArrowSchemaConverter; +use crate::column::chunker::ContentDefinedChunker; +use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; +use crate::column::writer::{GenericColumnWriter, get_column_writer}; +use crate::errors::Result; +use crate::file::metadata::{ + FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataBuilder, ParquetMetaDataWriter, + RowGroupMetaData, +}; +use crate::file::page_index::column_index::ColumnIndexMetaData; +use crate::file::page_index::offset_index::OffsetIndexMetaData; +use crate::file::properties::{CdcOptions, EnabledStatistics, WriterProperties, WriterPropertiesPtr}; +use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; +use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, SchemaDescriptor}; + +// --------------------------------------------------------------------------- +// ContentAddressedPageWriter — internal PageWriter impl +// --------------------------------------------------------------------------- + +/// A [`PageWriter`] that writes each page to a content-addressed store directory. 
+struct ContentAddressedPageWriter { + buffer: SharedColumnChunk, + store_dir: PathBuf, + page_refs: Arc>>, + row_group: usize, + column: usize, + page_count: usize, +} + +impl ContentAddressedPageWriter { + fn new( + store_dir: PathBuf, + page_refs: Arc>>, + row_group: usize, + column: usize, + ) -> Self { + Self { + buffer: Arc::new(Mutex::new(ArrowColumnChunkData::default())), + store_dir, + page_refs, + row_group, + column, + page_count: 0, + } + } +} + +impl PageWriter for ContentAddressedPageWriter { + fn write_page(&mut self, page: CompressedPage) -> Result { + let page_header = page.to_thrift_header()?; + let mut header_bytes = Vec::with_capacity(256); + { + let mut protocol = ThriftCompactOutputProtocol::new(&mut header_bytes); + page_header.write_thrift(&mut protocol)?; + } + let header = Bytes::from(header_bytes); + + let data = page.compressed_page().buffer().clone(); + let compressed_size = data.len() + header.len(); + + let mut hasher = blake3::Hasher::new(); + hasher.update(&header); + hasher.update(&data); + let hash = hasher.finalize(); + let hash_hex = hash.to_hex().to_string(); + + let page_path = self.store_dir.join(format!("{hash_hex}.page")); + if !page_path.exists() { + let mut file = fs::File::create(&page_path)?; + file.write_all(&header)?; + file.write_all(&data)?; + } + + let mut buf = self.buffer.try_lock().unwrap(); + let offset = buf.length as u64; + buf.length += compressed_size; + buf.data.push(header.clone()); + buf.data.push(data); + + let is_dict = page.page_type() == crate::basic::PageType::DICTIONARY_PAGE; + self.page_refs.lock().unwrap().push(PageRef { + row_group: self.row_group, + column: self.column, + page_index: self.page_count, + offset: offset as i64, + size: compressed_size as i32, + hash: hash_hex, + is_dict, + }); + self.page_count += 1; + + let mut spec = PageWriteSpec::new(); + spec.page_type = page.page_type(); + spec.num_values = page.num_values(); + spec.uncompressed_size = page.uncompressed_size() + 
header.len();
        spec.offset = offset;
        spec.compressed_size = compressed_size;
        spec.bytes_written = compressed_size as u64;
        Ok(spec)
    }

    fn close(&mut self) -> Result<()> {
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// Column writer factory
// ---------------------------------------------------------------------------

/// Build one [`ArrowColumnWriter`] per Parquet leaf column of `arrow_schema`.
///
/// Each writer is backed by a [`ContentAddressedPageWriter`] that writes its
/// pages into `store_dir` and records page references for `row_group` in
/// `page_refs`.
// NOTE(review): the generic parameters of `page_refs` were reconstructed as
// `Arc<Mutex<Vec<PageRef>>>` from later field accesses (`pr.row_group`,
// `pr.column`, `pr.offset`) — confirm against the original source.
fn create_column_writers(
    schema: &SchemaDescriptor,
    arrow_schema: &SchemaRef,
    props: &WriterPropertiesPtr,
    store_dir: &Path,
    page_refs: &Arc<Mutex<Vec<PageRef>>>,
    row_group: usize,
) -> Result<Vec<crate::arrow::arrow_writer::ArrowColumnWriter>> {
    let mut writers = Vec::new();
    let mut leaves = schema.columns().iter();
    let mut col_idx = 0usize;
    for field in &arrow_schema.fields {
        create_writers_for_type(
            field.data_type(),
            props,
            &mut leaves,
            store_dir,
            page_refs,
            row_group,
            &mut col_idx,
            &mut writers,
        )?;
    }
    Ok(writers)
}

/// Construct a single column writer backed by a content-addressed page writer.
///
/// `use_byte_array` selects the specialized byte-array encoder path used for
/// string/binary columns; otherwise the generic typed column writer is used.
fn make_column_writer(
    desc: &ColumnDescPtr,
    props: &WriterPropertiesPtr,
    store_dir: &Path,
    page_refs: &Arc<Mutex<Vec<PageRef>>>,
    row_group: usize,
    col_idx: usize,
    use_byte_array: bool,
) -> Result<crate::arrow::arrow_writer::ArrowColumnWriter> {
    let pw = Box::new(ContentAddressedPageWriter::new(
        store_dir.to_path_buf(),
        page_refs.clone(),
        row_group,
        col_idx,
    ));
    // Keep a handle to the shared chunk buffer before the page writer is
    // moved into the column writer below.
    let chunk: SharedColumnChunk = pw.buffer.clone();

    let writer = if use_byte_array {
        ArrowColumnWriterImpl::ByteArray(GenericColumnWriter::new(
            desc.clone(),
            props.clone(),
            pw,
        ))
    } else {
        ArrowColumnWriterImpl::Column(get_column_writer(desc.clone(), props.clone(), pw))
    };

    Ok(crate::arrow::arrow_writer::ArrowColumnWriter { chunk, writer })
}

/// Recursively walk `data_type`, consuming one descriptor from `leaves` for
/// every Parquet leaf column and appending the matching writer to `out`.
///
/// # Errors
///
/// Returns `ParquetError::General` if the Arrow schema implies more leaf
/// columns than the Parquet schema provides (the previous implementation
/// panicked via `unwrap` here), and `ParquetError::NYI` for unsupported
/// Arrow types.
fn create_writers_for_type(
    data_type: &ArrowDataType,
    props: &WriterPropertiesPtr,
    leaves: &mut std::slice::Iter<'_, ColumnDescPtr>,
    store_dir: &Path,
    page_refs: &Arc<Mutex<Vec<PageRef>>>,
    row_group: usize,
    col_idx: &mut usize,
    out: &mut Vec<crate::arrow::arrow_writer::ArrowColumnWriter>,
) -> Result<()> {
    // Consume the next leaf descriptor and build its writer. Shared by the
    // primitive (`use_byte_array == false`) and byte-array (`true`) paths,
    // replacing the two previous near-identical closures.
    let leaf = |idx: &mut usize,
                leaves: &mut std::slice::Iter<'_, ColumnDescPtr>,
                use_byte_array: bool| {
        let desc = leaves.next().ok_or_else(|| {
            crate::errors::ParquetError::General(
                "PageStoreWriter: Arrow schema has more leaf columns than the Parquet schema"
                    .to_string(),
            )
        })?;
        let i = *idx;
        *idx += 1;
        make_column_writer(desc, props, store_dir, page_refs, row_group, i, use_byte_array)
    };

    match data_type {
        _ if data_type.is_primitive() => out.push(leaf(col_idx, leaves, false)?),
        ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Boolean | ArrowDataType::Null => {
            out.push(leaf(col_idx, leaves, false)?)
        }
        ArrowDataType::LargeBinary
        | ArrowDataType::Binary
        | ArrowDataType::Utf8
        | ArrowDataType::LargeUtf8
        | ArrowDataType::BinaryView
        | ArrowDataType::Utf8View => out.push(leaf(col_idx, leaves, true)?),
        ArrowDataType::List(f)
        | ArrowDataType::LargeList(f)
        | ArrowDataType::FixedSizeList(f, _)
        | ArrowDataType::ListView(f)
        | ArrowDataType::LargeListView(f) => {
            // Lists add repetition levels but only their element contributes
            // leaf columns.
            create_writers_for_type(
                f.data_type(),
                props,
                leaves,
                store_dir,
                page_refs,
                row_group,
                col_idx,
                out,
            )?;
        }
        ArrowDataType::Struct(fields) => {
            for field in fields {
                create_writers_for_type(
                    field.data_type(),
                    props,
                    leaves,
                    store_dir,
                    page_refs,
                    row_group,
                    col_idx,
                    out,
                )?;
            }
        }
        ArrowDataType::Map(f, _) => match f.data_type() {
            // A map is encoded as an entries struct of (key, value) groups.
            ArrowDataType::Struct(f) => {
                create_writers_for_type(
                    f[0].data_type(),
                    props,
                    leaves,
                    store_dir,
                    page_refs,
                    row_group,
                    col_idx,
                    out,
                )?;
                create_writers_for_type(
                    f[1].data_type(),
                    props,
                    leaves,
                    store_dir,
                    page_refs,
                    row_group,
                    col_idx,
                    out,
                )?;
            }
            _ => unreachable!("invalid map type"),
        },
        ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() {
            ArrowDataType::Utf8
            | ArrowDataType::LargeUtf8
            | ArrowDataType::Binary
            | ArrowDataType::LargeBinary
            | ArrowDataType::Utf8View
            | ArrowDataType::BinaryView
            | ArrowDataType::FixedSizeBinary(_) => out.push(leaf(col_idx, leaves, true)?),
            _ => out.push(leaf(col_idx, leaves, false)?),
        },
        _ => {
            return Err(crate::errors::ParquetError::NYI(format!(
                "PageStoreWriter: unsupported Arrow type {data_type}"
            )));
        }
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// PageStoreWriter
// ---------------------------------------------------------------------------

/// Writes Arrow [`RecordBatch`]es to a content-addressed page store.
///
/// Each data page is written as a separate file named by its BLAKE3 hash
/// under `store_dir`. The metadata-only Parquet file is written to an
/// explicit path on [`Self::finish`], containing the schema, row group
/// metadata, and a manifest mapping page locations to their hashes.
///
/// A single `store_dir` can hold pages belonging to many Parquet files;
/// identical pages across files are automatically deduplicated.
///
/// # Example
/// ```no_run
/// # use std::sync::Arc;
/// # use arrow_array::{ArrayRef, Int32Array, RecordBatch};
/// # use parquet::arrow::page_store::PageStoreWriter;
/// # use parquet::file::properties::WriterProperties;
/// let batch = RecordBatch::try_from_iter(vec![
///     ("id", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef),
/// ]).unwrap();
///
/// let store = std::env::temp_dir().join("pages");
/// let mut writer = PageStoreWriter::try_new(&store, batch.schema(), None).unwrap();
/// writer.write(&batch).unwrap();
/// writer.finish(store.join("table_a.parquet")).unwrap();
/// ```
// NOTE(review): the generic parameters of the container fields below were
// reconstructed from usage elsewhere in this file — confirm against the
// original source.
pub struct PageStoreWriter {
    store_dir: PathBuf,
    schema: SchemaDescPtr,
    arrow_schema: SchemaRef,
    props: WriterPropertiesPtr,
    // Shared with every ContentAddressedPageWriter; collects one entry per
    // written page so finish() can embed the manifest.
    page_refs: Arc<Mutex<Vec<PageRef>>>,
    row_groups: Vec<RowGroupMetaData>,
    column_indexes: Vec<Vec<ColumnIndexMetaData>>,
    offset_indexes: Vec<Vec<OffsetIndexMetaData>>,
    // Row group currently being written; created lazily on first write().
    in_progress: Option<ArrowRowGroupWriter>,
    // One CDC chunker per leaf column when content-defined chunking is on.
    cdc_chunkers: Option<Vec<ContentDefinedChunker>>,
    row_group_index: usize,
    total_rows: i64,
    // Next free offset in the "virtual file" the page offsets are rebased to.
    next_page_offset: i64,
}

impl PageStoreWriter {
    /// Create a new `PageStoreWriter`.
    ///
    /// Creates `store_dir` if it does not exist.
+ pub fn try_new( + store_dir: impl Into, + arrow_schema: SchemaRef, + props: Option, + ) -> Result { + let store_dir = store_dir.into(); + fs::create_dir_all(&store_dir)?; + + let props = props.unwrap_or_else(|| { + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .set_content_defined_chunking(Some(CdcOptions::default())) + .build() + }); + + let cdc_default = CdcOptions::default(); + let cdc_opts = props.content_defined_chunking().or(Some(&cdc_default)); + + let schema = { + let converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types()); + converter.convert(&arrow_schema)? + }; + + let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); + + let cdc_chunkers = cdc_opts + .map(|opts| { + schema_descr + .columns() + .iter() + .map(|desc| ContentDefinedChunker::new(desc, opts)) + .collect::>>() + }) + .transpose()?; + + let props_ptr = Arc::new(props); + + Ok(Self { + store_dir, + schema: schema_descr, + arrow_schema, + props: props_ptr, + page_refs: Arc::new(Mutex::new(Vec::new())), + row_groups: Vec::new(), + column_indexes: Vec::new(), + offset_indexes: Vec::new(), + in_progress: None, + cdc_chunkers, + row_group_index: 0, + total_rows: 0, + next_page_offset: 0, + }) + } + + /// Write a [`RecordBatch`] to the page store. + pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { + if self.in_progress.is_none() { + let writers = create_column_writers( + &self.schema, + &self.arrow_schema, + &self.props, + &self.store_dir, + &self.page_refs, + self.row_group_index, + )?; + self.in_progress = Some(ArrowRowGroupWriter::new(writers, &self.arrow_schema)); + } + + let in_progress = self.in_progress.as_mut().unwrap(); + match self.cdc_chunkers.as_mut() { + Some(chunkers) => in_progress.write_with_chunkers(batch, chunkers)?, + None => in_progress.write(batch)?, + } + Ok(()) + } + + /// Flush the current row group. 
+ pub fn flush(&mut self) -> Result<()> { + let in_progress = match self.in_progress.take() { + Some(ip) => ip, + None => return Ok(()), + }; + + let buffered_rows = in_progress.buffered_rows; + let chunks: Vec = in_progress.close()?; + + let mut column_metadata = Vec::with_capacity(chunks.len()); + let mut col_indexes: Vec = Vec::with_capacity(chunks.len()); + let mut off_indexes: Vec = Vec::with_capacity(chunks.len()); + let mut total_byte_size = 0i64; + + let mut cumulative_offset: i64 = self.next_page_offset; + let mut col_idx = 0usize; + + for chunk in chunks { + let mut close = chunk.close; + total_byte_size += close.metadata.uncompressed_size(); + + let src_dict_offset = close.metadata.dictionary_page_offset(); + let src_data_offset = close.metadata.data_page_offset(); + let src_start = src_dict_offset.unwrap_or(src_data_offset); + let delta = cumulative_offset - src_start; + + let mut col_builder = close.metadata.into_builder(); + col_builder = col_builder.set_data_page_offset(src_data_offset + delta); + if let Some(dict_off) = src_dict_offset { + col_builder = col_builder.set_dictionary_page_offset(Some(dict_off + delta)); + } + close.metadata = col_builder.build()?; + + if let Some(ref mut oi) = close.offset_index { + for loc in &mut oi.page_locations { + loc.offset += delta; + } + } + + { + let mut page_refs = self.page_refs.lock().unwrap(); + for pr in page_refs.iter_mut() { + if pr.row_group == self.row_group_index && pr.column == col_idx { + pr.offset += delta; + } + } + } + + col_idx += 1; + cumulative_offset += close.metadata.compressed_size(); + + column_metadata.push(close.metadata); + col_indexes.push(close.column_index.unwrap_or(ColumnIndexMetaData::NONE)); + if let Some(oi) = close.offset_index { + off_indexes.push(oi); + } else { + off_indexes.push(OffsetIndexMetaData { + page_locations: vec![], + unencoded_byte_array_data_bytes: None, + }); + } + } + + self.next_page_offset = cumulative_offset; + + let row_group = 
RowGroupMetaData::builder(self.schema.clone()) + .set_column_metadata(column_metadata) + .set_total_byte_size(total_byte_size) + .set_num_rows(buffered_rows as i64) + .set_ordinal(self.row_group_index as i16) + .build()?; + + self.total_rows += buffered_rows as i64; + self.row_groups.push(row_group); + self.column_indexes.push(col_indexes); + self.offset_indexes.push(off_indexes); + self.row_group_index += 1; + Ok(()) + } + + /// Flush remaining data and write the metadata-only Parquet file to `path`. + pub fn finish(mut self, path: impl AsRef) -> Result { + self.flush()?; + + let page_refs = self.page_refs.lock().unwrap().clone(); + let manifest = PageStoreManifest { pages: page_refs }; + let manifest_json = serde_json::to_string(&manifest) + .map_err(|e| crate::errors::ParquetError::General(e.to_string()))?; + + let file_metadata = FileMetaData::new( + 2, + self.total_rows, + Some("parquet-rs page_store".to_string()), + Some(vec![KeyValue::new(MANIFEST_KEY.to_string(), manifest_json)]), + self.schema.clone(), + None, + ); + + let mut builder = ParquetMetaDataBuilder::new(file_metadata); + for rg in self.row_groups { + builder = builder.add_row_group(rg); + } + builder = builder.set_column_index(Some(self.column_indexes)); + builder = builder.set_offset_index(Some(self.offset_indexes)); + let metadata = builder.build(); + + let file = fs::File::create(path.as_ref())?; + ParquetMetaDataWriter::new(file, &metadata).finish()?; + + Ok(metadata) + } +} diff --git a/parquet/src/bin/parquet-page-store.rs b/parquet/src/bin/parquet-page-store.rs new file mode 100644 index 000000000000..47f9e4be1f75 --- /dev/null +++ b/parquet/src/bin/parquet-page-store.rs @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! CLI tool for working with a content-addressed Parquet page store. +//! +//! # Install +//! +//! ```text +//! cargo install parquet --features=page_store,cli +//! ``` +//! +//! # Write a Parquet file into a page store +//! +//! ```text +//! parquet-page-store write input.parquet --store ./pages --output ./meta +//! ``` +//! +//! # Read a page-store-backed Parquet file +//! +//! ```text +//! parquet-page-store read ./meta/input.meta.parquet --store ./pages +//! ``` + +use std::fs::File; +use std::path::PathBuf; + +use arrow_array::RecordBatchReader; +use clap::{Parser, Subcommand, ValueEnum}; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::arrow::page_store::{PageStoreReader, PageStoreWriter}; +use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; +use parquet::errors::Result; +use parquet::file::properties::WriterProperties; + +#[derive(Debug, Parser)] +#[clap(author, version)] +/// Content-addressed Parquet page store. +/// +/// A page store splits Parquet data pages into individual files named by their +/// BLAKE3 hash. Identical pages across files are stored only once, enabling +/// efficient deduplication when used with content-defined chunking (CDC). +/// +/// The workflow has two steps: +/// +/// 1. 
`write` — reads regular Parquet files, re-encodes their pages with CDC +/// chunking, writes each page as a {hash}.page blob into a shared store +/// directory, and produces a lightweight metadata-only Parquet file. +/// +/// 2. `read` — given a metadata Parquet file and the store directory, +/// reassembles the data and prints it. +/// +/// Quick start: +/// +/// # Write a file into the store +/// parquet-page-store write data.parquet --store ./pages --output ./meta +/// +/// # Read it back +/// parquet-page-store read ./meta/data.meta.parquet --store ./pages +/// +/// # Write several files (pages are deduplicated across them) +/// parquet-page-store write a.parquet b.parquet --store ./pages +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + /// Write Parquet files into a page store. + /// + /// Each input file is read, its pages are re-encoded with CDC chunking and + /// written to the store directory as {hash}.page blobs. A metadata-only + /// Parquet file is produced for each input (named {stem}.meta.parquet). + /// + /// Multiple files can share the same store directory — identical pages are + /// automatically deduplicated. + /// + /// Examples: + /// + /// # Single file, metadata written to current directory + /// parquet-page-store write data.parquet --store ./pages + /// + /// # Explicit output directory + /// parquet-page-store write data.parquet --store ./pages --output ./meta + /// + /// # Multiple files into the same store + /// parquet-page-store write a.parquet b.parquet --store ./pages + /// + /// # Write without compression + /// parquet-page-store write data.parquet --store ./pages --compression none + Write { + /// Input Parquet file(s). + #[clap(required = true)] + inputs: Vec, + + /// Page store directory for .page blobs (created if it does not exist). + #[clap(short, long)] + store: PathBuf, + + /// Output directory for metadata Parquet files [default: current directory]. 
+ #[clap(short, long)] + output: Option, + + /// Compression codec for page data [default: zstd]. + #[clap(long, default_value = "zstd")] + compression: CompressionArg, + }, + + /// Read a page-store-backed Parquet file and print its contents. + /// + /// The metadata Parquet file contains the schema, row group structure, and + /// a manifest mapping each page to its BLAKE3 hash. The actual page data + /// is read from the store directory. + /// + /// Example: + /// + /// parquet-page-store read data.meta.parquet --store ./pages + Read { + /// Path to the metadata-only Parquet file. + input: PathBuf, + + /// Page store directory containing the .page blobs. + #[clap(short, long)] + store: PathBuf, + }, +} + +#[derive(Debug, Clone, ValueEnum)] +enum CompressionArg { + None, + Snappy, + Gzip, + Lzo, + Brotli, + Lz4, + Zstd, + Lz4Raw, +} + +impl CompressionArg { + fn to_parquet(&self) -> Compression { + match self { + CompressionArg::None => Compression::UNCOMPRESSED, + CompressionArg::Snappy => Compression::SNAPPY, + CompressionArg::Gzip => Compression::GZIP(GzipLevel::default()), + CompressionArg::Lzo => Compression::LZO, + CompressionArg::Brotli => Compression::BROTLI(BrotliLevel::default()), + CompressionArg::Lz4 => Compression::LZ4, + CompressionArg::Zstd => Compression::ZSTD(ZstdLevel::default()), + CompressionArg::Lz4Raw => Compression::LZ4_RAW, + } + } +} + +fn main() { + let cli = Cli::parse(); + let result = match cli.command { + Command::Write { + inputs, + store, + output, + compression, + } => cmd_write(&inputs, &store, output.as_deref(), compression), + Command::Read { input, store } => cmd_read(&input, &store), + }; + if let Err(e) = result { + eprintln!("Error: {e}"); + std::process::exit(1); + } +} + +fn cmd_write( + inputs: &[PathBuf], + store: &PathBuf, + output_dir: Option<&std::path::Path>, + compression: CompressionArg, +) -> Result<()> { + let output_dir = output_dir.unwrap_or_else(|| std::path::Path::new(".")); + 
std::fs::create_dir_all(output_dir)?; + + for input in inputs { + let file = File::open(input)?; + let reader = ParquetRecordBatchReaderBuilder::try_new(file)? + .with_batch_size(8192) + .build()?; + let schema = reader.schema(); + + let stem = input + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("output"); + let meta_path = output_dir.join(format!("{stem}.meta.parquet")); + + let props = WriterProperties::builder() + .set_compression(compression.to_parquet()) + .build(); + let mut writer = PageStoreWriter::try_new(store, schema, Some(props))?; + let mut total_rows = 0usize; + for batch in reader { + let batch = + batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; + total_rows += batch.num_rows(); + writer.write(&batch)?; + } + let metadata = writer.finish(&meta_path)?; + + let page_count = metadata + .file_metadata() + .key_value_metadata() + .and_then(|kvs| { + kvs.iter() + .find(|kv| kv.key == "page_store.manifest") + .and_then(|kv| kv.value.as_ref()) + }) + .and_then(|v| { + serde_json::from_str::(v) + .ok() + .and_then(|j| j["pages"].as_array().map(|a| a.len())) + }) + .unwrap_or(0); + + eprintln!( + "{}: {} rows, {} row group(s), {} pages -> {}", + input.display(), + total_rows, + metadata.num_row_groups(), + page_count, + meta_path.display(), + ); + } + + let page_files = std::fs::read_dir(store)? + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .count(); + eprintln!( + "Page store: {} page file(s) in {}", + page_files, + store.display() + ); + + Ok(()) +} + +fn cmd_read(input: &PathBuf, store: &PathBuf) -> Result<()> { + let reader = PageStoreReader::try_new(input, store)?; + let md = reader.metadata(); + + eprintln!( + "Schema: {} column(s), {} row group(s), {} total row(s)", + md.row_groups().first().map_or(0, |rg| rg.num_columns()), + md.num_row_groups(), + md.file_metadata().num_rows(), + ); + + let mut total_rows = 0usize; + for batch in reader.reader()? 
{ + let batch = batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; + total_rows += batch.num_rows(); + } + eprintln!("Read {} row(s)", total_rows); + + Ok(()) +} From 0d34a46a66e2bdb85df821d94cd3bd3fd01ea8b6 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 19:05:10 +0200 Subject: [PATCH 4/8] feat(parquet): add page store demo, reconstruct CLI command, and roundtrip verification --- parquet/Cargo.toml | 3 +- parquet/examples/page_store.rs | 6 +- parquet/examples/page_store_dedup/.gitignore | 6 + parquet/examples/page_store_dedup/README.md | 159 +++++++ parquet/examples/page_store_dedup/concept.py | 360 +++++++++++++++ .../page_store_dedup/page_store_concept.svg | 220 ++++++++++ parquet/examples/page_store_dedup/pipeline.py | 409 ++++++++++++++++++ parquet/src/arrow/mod.rs | 4 +- parquet/src/arrow/page_store/mod.rs | 314 +++++++++++--- parquet/src/arrow/page_store/reader.rs | 13 +- parquet/src/arrow/page_store/writer.rs | 58 ++- parquet/src/bin/parquet-page-store.rs | 117 ++++- 12 files changed, 1571 insertions(+), 98 deletions(-) create mode 100644 parquet/examples/page_store_dedup/.gitignore create mode 100644 parquet/examples/page_store_dedup/README.md create mode 100644 parquet/examples/page_store_dedup/concept.py create mode 100644 parquet/examples/page_store_dedup/page_store_concept.svg create mode 100644 parquet/examples/page_store_dedup/pipeline.py diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 5e4eeacee1f8..0bf5f66eeeac 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -66,6 +66,7 @@ num-integer = { version = "0.1.46", default-features = false, features = ["std"] num-traits = { version = "0.2.19", default-features = false, features = ["std"] } base64 = { version = "0.22", default-features = false, features = ["std", ], optional = true } clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } +glob = { version 
= "0.3", default-features = false, optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } seq-macro = { version = "0.3", default-features = false } @@ -110,7 +111,7 @@ arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", # Enable support for arrow canonical extension types arrow_canonical_extension_types = ["arrow-schema?/canonical_extension_types"] # Enable CLI tools -cli = ["json", "base64", "clap", "arrow-csv", "serde"] +cli = ["json", "base64", "clap", "arrow-csv", "serde", "dep:glob"] # Enable JSON APIs json = ["serde_json", "base64"] # Enable internal testing APIs diff --git a/parquet/examples/page_store.rs b/parquet/examples/page_store.rs index 8b963329c997..736ce9354694 100644 --- a/parquet/examples/page_store.rs +++ b/parquet/examples/page_store.rs @@ -87,7 +87,11 @@ fn main() -> parquet::errors::Result<()> { let batches = reader.read_batches()?; let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); - println!("Read {} batch(es), {} total rows", batches.len(), total_rows); + println!( + "Read {} batch(es), {} total rows", + batches.len(), + total_rows + ); // Display let formatted = pretty_format_batches(&batches).unwrap(); diff --git a/parquet/examples/page_store_dedup/.gitignore b/parquet/examples/page_store_dedup/.gitignore new file mode 100644 index 000000000000..9d823ef6cc69 --- /dev/null +++ b/parquet/examples/page_store_dedup/.gitignore @@ -0,0 +1,6 @@ +data/ +meta/ +pages/ +verify/ +.venv/ +.cache/ diff --git a/parquet/examples/page_store_dedup/README.md b/parquet/examples/page_store_dedup/README.md new file mode 100644 index 000000000000..63ffa70f2ceb --- /dev/null +++ b/parquet/examples/page_store_dedup/README.md @@ -0,0 +1,159 @@ +# Parquet Page Store — Deduplication Demo + +> **Prototype**: This is an experimental feature exploring content-defined +> 
chunking for Parquet. APIs and file formats may change.

Demonstrates how Content-Defined Chunking (CDC) enables efficient deduplication
across multiple versions of a dataset using the Parquet page store writer in
Apache Arrow Rust. The deduplication is self-contained in the Parquet writer —
no special storage system is required.

## What this demo shows

Four common dataset operations are applied to a real-world dataset
([OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5)
conversational data, ~800 MB per file). Each operation produces a separate
Parquet file. Without a page store, storing all four files costs the full sum
of their sizes. With the CDC page store, identical pages are stored **exactly
once** — indexed by their BLAKE3 hash — so the four files share most of their
bytes. The resulting files can be stored anywhere.

| File | Operation |
|------|-----------|
| `original.parquet` | Baseline dataset (~996k rows) |
| `filtered.parquet` | Keep rows where `num_turns ≤ 3` |
| `augmented.parquet` | Original + computed column `num_turns` |
| `appended.parquet` | Original + 5 000 new rows appended |

## Prerequisites

```bash
pip install pyarrow matplotlib huggingface_hub drawsvg
cargo build --release -p parquet --features page_store,cli
```

## Running the demo

```bash
cd parquet/examples/page_store_dedup

# Run the full pipeline: prepare data, build binary, ingest into page store, show stats
python pipeline.py

# Then generate diagrams
python concept.py
```

Individual steps can be skipped if they've already run:

```bash
python pipeline.py --skip-prepare --skip-build             # re-run ingest + stats only
python pipeline.py --skip-prepare --skip-build --skip-ingest  # stats only
```

Outputs:
- `page_store_concept.png` — architectural overview of how shared pages work
- `page_store_savings.png` — side-by-side storage comparison with real numbers

## Using your own dataset

```bash
python pipeline.py
--file /path/to/your.parquet +``` + +The script requires a `conversations` list column for the filtered and augmented +variants. Adapt `pipeline.py` to your own schema as needed. + +## Results + +Dataset: **OpenHermes-2.5** (short conversations, `num_turns < 10`) + +### Dataset variants + +| File | Operation | Rows | Size | +|------|-----------|------|------| +| `original.parquet` | Baseline | 996,009 | 782.1 MB | +| `filtered.parquet` | Keep `num_turns ≤ 3` (removes 0.2% of rows) | 993,862 | 776.8 MB | +| `augmented.parquet` | Add column `num_turns` | 996,009 | 782.2 MB | +| `appended.parquet` | Append 5,000 rows | 1,001,009 | 788.6 MB | +| **Total** | | | **3,129.7 MB** | + +### Page store results + +| Metric | Value | +|--------|-------| +| Unique pages stored | 3,400 | +| Total page references | 15,179 | +| Page store size | 559.0 MB | +| Metadata files size | 4.4 MB | +| **Page store + metadata** | **563.4 MB** | +| **Storage saved** | **2,566.3 MB (82%)** | +| **Deduplication ratio** | **5.6×** | + +### Per-file page breakdown + +| File | Page refs | Unique hashes | New pages | Reused pages | +|------|-----------|---------------|-----------|--------------| +| `original.parquet` | 3,782 | 3,100 | 3,100 | 0 | +| `filtered.parquet` | 3,755 | 3,075 | 222 | 2,853 (92%) | +| `augmented.parquet` | 3,834 | 3,136 | 36 | 3,100 (98%) | +| `appended.parquet` | 3,808 | 3,125 | 42 | 3,083 (98%) | + +### Key insights + +1. **Adding a column** (`augmented`): only 36 new pages out of 3,136 (1.1%). + The existing 17 columns produce identical CDC pages — only the new `num_turns` + column contributes new pages. + +2. **Appending rows** (`appended`): only 42 new pages out of 3,125 (1.3%). + The original 996k rows' pages are unchanged; only the 5k new rows create new pages. + +3. **Filtering rows** (`filtered`): 92% of pages reused despite row removal. + Removing just 0.2% of rows barely shifts CDC boundaries — most pages are + unchanged. 
Heavier filtering (removing 20–50% of rows) would produce more new + pages, as CDC boundaries shift further throughout the file. + +4. **Net result**: 4 dataset versions stored for **563 MB instead of 3.1 GB** — an + **82% reduction**, or equivalently, 4 versions for the cost of **0.72×** a single + version. + +## How it works + +``` +Standard Parquet — each file stored independently: + + original.parquet ──► [ page 1 ][ page 2 ][ page 3 ]...[ page N ] + filtered.parquet ──► [ page 1'][ page 2 ][ page 3 ]...[ page M ] + augmented.parquet ──► [ page 1 ][ page 2 ][ page 3 ]...[ page N ][ extra ] + appended.parquet ──► [ page 1 ][ page 2 ][ page 3 ]...[ page N ][ new ] + + Total: sum of all four file sizes + +CDC Page Store — content-addressed, deduplicated: + + pages/ + .page ← shared by original, augmented, appended + .page ← shared by original, filtered, augmented, appended + .page ← shared by filtered only (boundary page) + ... (only UNIQUE pages stored) + + meta/ + original.meta.parquet ← tiny manifest referencing page hashes + filtered.meta.parquet + augmented.meta.parquet + appended.meta.parquet + + Total: ~18% of the combined file sizes +``` + +CDC ensures that page boundaries are **content-defined** (not fixed row +counts), so adding columns or appending rows only requires storing the small +number of new pages — the rest remain identical and are reused. + +## Further reading + +- [`parquet::arrow::page_store`][api] API docs +- [`parquet-page-store` CLI][cli] source + +[api]: https://docs.rs/parquet/latest/parquet/arrow/page_store/index.html +[cli]: ../../src/bin/parquet-page-store.rs diff --git a/parquet/examples/page_store_dedup/concept.py b/parquet/examples/page_store_dedup/concept.py new file mode 100644 index 000000000000..cdd30789145e --- /dev/null +++ b/parquet/examples/page_store_dedup/concept.py @@ -0,0 +1,360 @@ +#!/usr/bin/env python3 +""" +Generate the Parquet Page Store concept diagram. 
+ +Output: page_store_concept.svg (open in any browser) + page_store_concept.png (requires drawsvg[raster]) +""" + +import os +import drawsvg as draw + +HERE = os.path.dirname(__file__) + +# --------------------------------------------------------------------------- +# Palette +# --------------------------------------------------------------------------- + +BG = "#0f1117" +SURFACE = "#161b22" +BORDER = "#2a2f3a" +TEXT_HI = "#f0f6fc" +TEXT_LO = "#6e7681" +BLUE = "#4493f8" +GREEN = "#3fb950" +PURPLE = "#bc8cff" +ORANGE = "#f0883e" +WHITE = "#ffffff" + +# --------------------------------------------------------------------------- +# Layout grid (derive everything from these constants) +# --------------------------------------------------------------------------- + +PAD = 28 # outer margin +GAP = 120 # gap between file panel right edge and store left edge + +FILE_W = 360 +FILE_H = 104 +FILE_GAP = 14 # vertical gap between file cards + +N_FILES = 4 +FILES_H = N_FILES * FILE_H + (N_FILES - 1) * FILE_GAP # 502 + +STORE_Y_PAD = 38 # store header height (folder name + divider) +STORE_LEG_H = 82 # legend block at bottom of store +STORE_H = FILES_H # store and file panel share the same height + +TITLE_H = 82 # space taken by title block +TOP_Y = TITLE_H + 12 + +CMP_H = 82 # bottom comparison bar height +CMP_GAP = 18 + +STORE_W = 256 # fixed, intentionally compact +STORE_X = PAD + FILE_W + GAP +CANVAS_W = STORE_X + STORE_W + PAD +CANVAS_H = TOP_Y + STORE_H + CMP_GAP + CMP_H + PAD + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + +def generate(total_mb: float = 3129.7, store_mb: float = 563.4) -> None: + """Render the concept SVG (and optionally PNG) to the script directory.""" + + d = draw.Drawing(CANVAS_W, CANVAS_H) + + # ----------------------------------------------------------------------- + # Drawing helpers (closures over d) + # 
----------------------------------------------------------------------- + + def bg_rect(x, y, w, h, fill=SURFACE, stroke=BORDER, rx=8, **kw): + d.append(draw.Rectangle(x, y, w, h, fill=fill, stroke=stroke, + stroke_width=1.5, rx=rx, ry=rx, **kw)) + + def txt(s, x, y, size=13, fill=TEXT_HI, anchor="middle", weight="normal", **kw): + d.append(draw.Text(s, size, x, y, text_anchor=anchor, fill=fill, + font_weight=weight, + font_family="ui-monospace,'SF Mono',monospace", **kw)) + + def hline(x1, x2, y, color=BORDER, width=1, opacity=1.0, dash=None): + kw = {"stroke_dasharray": dash} if dash else {} + d.append(draw.Line(x1, y, x2, y, stroke=color, stroke_width=width, + stroke_opacity=opacity, **kw)) + + arrowhead_ids: dict[str, str] = {} + + def _arrowhead(color: str) -> str: + if color not in arrowhead_ids: + aid = f"ah{len(arrowhead_ids)}" + arrowhead_ids[color] = aid + m = draw.Marker(-0.1, -3, 4, 3, orient="auto", id=aid) + m.append(draw.Path(d="M0,-2.5 L3.5,0 L0,2.5 Z", fill=color)) + d.append_def(m) + return f"url(#{arrowhead_ids[color]})" + + def arrow_line(x1, y, x2, color, label=None): + path = draw.Path(stroke=color, stroke_width=1.8, stroke_opacity=0.55, + fill="none", marker_end=_arrowhead(color)) + path.M(x1, y) + path.L(x2, y) + d.append(path) + if label: + mx, lw = (x1 + x2) / 2, 72 + d.append(draw.Rectangle(mx - lw / 2, y - 10, lw, 16, + fill="#1c2128", stroke=color, stroke_width=1, + stroke_opacity=0.4, rx=8, ry=8)) + txt(label, mx, y + 2, size=9, fill=color, weight="bold") + + def page_tile(x, y, w, h, color, label): + """Filled page block with glow halo + label.""" + d.append(draw.Rectangle(x - 3, y - 3, w + 6, h + 6, + fill=color, fill_opacity=0.07, rx=6, ry=6)) + d.append(draw.Rectangle(x, y, w, h, fill=color, fill_opacity=0.18, + stroke=color, stroke_width=1.5, stroke_opacity=0.7, + rx=5, ry=5)) + txt(label, x + w / 2, y + h / 2 + 5, size=14, fill=color, weight="bold") + + def file_page(x, y, w, h, color, label): + """Smaller page block used inside 
file cards.""" + d.append(draw.Rectangle(x, y, w, h, fill=color, fill_opacity=0.22, + stroke=color, stroke_width=1, stroke_opacity=0.55, + rx=3, ry=3)) + txt(label, x + w / 2, y + h / 2 + 4, size=9, fill=color, weight="bold") + + # ----------------------------------------------------------------------- + # Background + grid + # ----------------------------------------------------------------------- + + d.append(draw.Rectangle(0, 0, CANVAS_W, CANVAS_H, fill=BG)) + for gx in range(0, CANVAS_W, 40): + d.append(draw.Line(gx, 0, gx, CANVAS_H, stroke=WHITE, + stroke_width=0.18, stroke_opacity=0.04)) + for gy in range(0, CANVAS_H, 40): + d.append(draw.Line(0, gy, CANVAS_W, gy, stroke=WHITE, + stroke_width=0.18, stroke_opacity=0.04)) + + # ----------------------------------------------------------------------- + # Title + # ----------------------------------------------------------------------- + + txt("Parquet Page Store", CANVAS_W / 2, 32, size=22, weight="bold") + + cx = CANVAS_W / 2 + d.append(draw.Raw( + f'' + f'Deduplication built into the Arrow Rust Parquet writer using ' + f'Content-Defined Chunking' + f'' + )) + + hline(CANVAS_W / 2 - 230, CANVAS_W / 2 + 230, 63, color=BORDER) + hline(CANVAS_W / 2 - 50, CANVAS_W / 2 + 50, 63, color=BLUE, width=2, opacity=0.45) + + # ----------------------------------------------------------------------- + # Section labels (centered above each panel) + # ----------------------------------------------------------------------- + + FILES_CX = PAD + FILE_W / 2 + STORE_CX = STORE_X + STORE_W / 2 + + LABEL_Y = TOP_Y - 10 + txt("INPUT FILES", FILES_CX, LABEL_Y, size=9, fill=TEXT_LO, weight="bold") + txt("PAGE STORE", STORE_CX, LABEL_Y, size=9, fill=TEXT_LO, weight="bold") + + # ----------------------------------------------------------------------- + # Store card + # ----------------------------------------------------------------------- + + bg_rect(STORE_X, TOP_Y, STORE_W, STORE_H, fill="#0d1117", stroke=BORDER, rx=10) + + txt("pages/", 
STORE_X + 16, TOP_Y + 20, size=11, fill=TEXT_LO, anchor="start") + hline(STORE_X + 12, STORE_X + STORE_W - 12, TOP_Y + 30, color=BORDER) + + # ----------------------------------------------------------------------- + # Unique pages grid (centered inside the store card) + # ----------------------------------------------------------------------- + + UNIQUE_PAGES = [ + (BLUE, "A"), (BLUE, "B"), (BLUE, "C"), (BLUE, "D"), + (BLUE, "E"), (BLUE, "F"), (PURPLE, "G"), (PURPLE, "H"), + (GREEN, "I"), (GREEN, "J"), (ORANGE, "K"), (ORANGE, "L"), + ] + + SP_COLS = 3 + SPW, SPH = 56, 40 + SP_GAP_X = 14 + SP_GAP_Y = 10 + + grid_w = SP_COLS * SPW + (SP_COLS - 1) * SP_GAP_X + SP_START_X = STORE_X + (STORE_W - grid_w) // 2 + SP_START_Y = TOP_Y + STORE_Y_PAD + 10 + + page_centers: dict[str, tuple[float, float]] = {} + + for i, (color, label) in enumerate(UNIQUE_PAGES): + col, row = i % SP_COLS, i // SP_COLS + px = SP_START_X + col * (SPW + SP_GAP_X) + py = SP_START_Y + row * (SPH + SP_GAP_Y) + page_tile(px, py, SPW, SPH, color, label) + page_centers[label] = (px + SPW / 2, py + SPH / 2) + txt(f"#{label.lower()}3f9a…", px + SPW / 2, py + SPH + 10, + size=7, fill=TEXT_LO) + + N_PAGE_ROWS = (len(UNIQUE_PAGES) + SP_COLS - 1) // SP_COLS + last_row_py = SP_START_Y + (N_PAGE_ROWS - 1) * (SPH + SP_GAP_Y) + hash_label_bottom = last_row_py + SPH + 14 + + LIST_MARGIN_X = 14 + LIST_INNER_X = 10 + LINE_H = 13 + LIST_INNER_PY = 7 + + LISTING = [ + ("158k", "a3f9b2e1c04d7f28"), + ("201k", "ff22e9640578db3c"), + ("167k", "bc8cff3ad19f673d"), + ("148k", "4493f8c9b28705f3"), + ("160k", "3fb950efa4891422"), + ] + + list_x = STORE_X + LIST_MARGIN_X + list_w = STORE_W - 2 * LIST_MARGIN_X + list_y = hash_label_bottom + 8 + list_h = len(LISTING) * LINE_H + 2 * LIST_INNER_PY + + d.append(draw.Rectangle(list_x, list_y, list_w, list_h, + fill="#0a0d12", rx=4, ry=4)) + + for i, (size, hash_prefix) in enumerate(LISTING): + baseline = list_y + LIST_INNER_PY + i * LINE_H + LINE_H - 3 + line_txt = f".rw-r--r-- 
{size:>4} {hash_prefix}….page" + txt(line_txt, list_x + LIST_INNER_X, baseline, + size=7.5, fill="#3d4450", anchor="start") + + # ----------------------------------------------------------------------- + # Legend (centered, pinned to bottom of store card) + # ----------------------------------------------------------------------- + + LEG_ITEMS = [ + (BLUE, "shared by all"), + (PURPLE, "filter boundary"), + (GREEN, "new column"), + (ORANGE, "new rows"), + ] + LEG_COL_W = STORE_W / 2 - 4 + LEG_Y0 = TOP_Y + STORE_H - STORE_LEG_H + 20 + + hline(STORE_X + 12, STORE_X + STORE_W - 12, + TOP_Y + STORE_H - STORE_LEG_H, color=BORDER) + + for i, (color, label) in enumerate(LEG_ITEMS): + col, row = i % 2, i // 2 + lx = STORE_X + 20 + col * LEG_COL_W + ly = LEG_Y0 + row * 22 + d.append(draw.Rectangle(lx, ly - 7, 11, 11, fill=color, + fill_opacity=0.85, rx=2, ry=2)) + txt(label, lx + 16, ly + 2, size=10, fill=TEXT_LO, anchor="start") + + # ----------------------------------------------------------------------- + # File cards + # ----------------------------------------------------------------------- + + PW, PH, PGAP = 34, 26, 4 + + FILES = [ + ("original.parquet", "baseline · 996k rows", + [(BLUE,"A"),(BLUE,"B"),(BLUE,"C"),(BLUE,"D"),(BLUE,"E"),(BLUE,"F")], + BLUE, "baseline"), + ("filtered.parquet", "keep num_turns < 3", + [(BLUE,"A"),(PURPLE,"G"),(BLUE,"C"),(BLUE,"D"),(BLUE,"E"),(PURPLE,"H")], + PURPLE, "92% reused"), + ("augmented.parquet", "add num_turns column", + [(BLUE,"A"),(GREEN,"I"),(BLUE,"B"),(BLUE,"C"),(GREEN,"J"),(BLUE,"D"),(BLUE,"E"),(BLUE,"F")], + GREEN, "98% reused"), + ("appended.parquet", "append 5 000 rows", + [(BLUE,"A"),(BLUE,"B"),(BLUE,"C"),(BLUE,"D"),(BLUE,"E"),(BLUE,"F"),(ORANGE,"K"),(ORANGE,"L")], + ORANGE, "98% reused"), + ] + + for fi, (fname, subtitle, pages, accent, reuse_lbl) in enumerate(FILES): + fy = TOP_Y + fi * (FILE_H + FILE_GAP) + card_mid_y = fy + FILE_H / 2 + + bg_rect(PAD, fy, FILE_W, FILE_H, fill=SURFACE, stroke=BORDER, rx=8) + 
d.append(draw.Rectangle(PAD, fy + 10, 3, FILE_H - 20, + fill=accent, fill_opacity=0.85, rx=1, ry=1)) + txt(fname, PAD + 16, fy + 26, size=12, fill=TEXT_HI, weight="bold", anchor="start") + txt(subtitle, PAD + 16, fy + 43, size=10, fill=TEXT_LO, anchor="start") + + strip_x = PAD + 16 + strip_y = fy + FILE_H - PH - 12 + for pi, (pcolor, plabel) in enumerate(pages): + file_page(strip_x + pi * (PW + PGAP), strip_y, PW, PH, pcolor, plabel) + + arrow_line(PAD + FILE_W + 4, card_mid_y, STORE_X - 4, accent, label=reuse_lbl) + + # ----------------------------------------------------------------------- + # Bottom: storage comparison bars + # ----------------------------------------------------------------------- + + CMP_Y = TOP_Y + STORE_H + CMP_GAP + CMP_X = PAD + CMP_W = CANVAS_W - PAD * 2 + + bg_rect(CMP_X, CMP_Y, CMP_W, CMP_H, fill="#0d1117", stroke=BORDER, rx=8) + txt("STORAGE COMPARISON", CMP_X + CMP_W / 2, CMP_Y + 13, + size=9, fill=TEXT_LO, weight="bold") + + LABEL_COL_W = 132 + RIGHT_PAD = 12 + TRACK_X = CMP_X + LABEL_COL_W + TRACK_W = CMP_W - LABEL_COL_W - RIGHT_PAD - 220 + + BAR_H = 20 + savings_pct = round((1 - store_mb / total_mb) * 100) + ratio = total_mb / store_mb + + R1_Y = CMP_Y + 22 + txt("Vanilla Parquet", TRACK_X - 8, R1_Y + BAR_H / 2 + 4, + size=10, fill=TEXT_LO, anchor="end") + d.append(draw.Rectangle(TRACK_X, R1_Y, TRACK_W, BAR_H, + fill="#ef5350", fill_opacity=0.22, + stroke="#ef5350", stroke_width=1.2, stroke_opacity=0.45, + rx=4, ry=4)) + txt(f"{total_mb:,.0f} MB (4 independent files)", + TRACK_X + TRACK_W + 10, R1_Y + BAR_H / 2 + 4, + size=10, fill="#ef9a9a", anchor="start") + + R2_Y = R1_Y + BAR_H + 8 + WITH_W = round(TRACK_W * store_mb / total_mb) + txt("Page Store via CDC", TRACK_X - 8, R2_Y + BAR_H / 2 + 4, + size=10, fill=TEXT_LO, anchor="end") + d.append(draw.Rectangle(TRACK_X, R2_Y, WITH_W, BAR_H, + fill="#66bb6a", fill_opacity=0.22, + stroke="#66bb6a", stroke_width=1.2, stroke_opacity=0.45, + rx=4, ry=4)) + txt(f"{store_mb:,.0f} MB — 
{savings_pct}% less · {ratio:.1f}× ratio", + TRACK_X + WITH_W + 10, R2_Y + BAR_H / 2 + 4, + size=10, fill="#a5d6a7", anchor="start") + + # ----------------------------------------------------------------------- + # Save + # ----------------------------------------------------------------------- + + out_svg = os.path.join(HERE, "page_store_concept.svg") + out_png = os.path.join(HERE, "page_store_concept.png") + d.save_svg(out_svg) + print(f" Saved {out_svg}") + + try: + d.save_png(out_png) + print(f" Saved {out_png}") + except Exception as e: + print(f" PNG skipped ({e}) — open the SVG in a browser") + + +if __name__ == "__main__": + generate() diff --git a/parquet/examples/page_store_dedup/page_store_concept.svg b/parquet/examples/page_store_dedup/page_store_concept.svg new file mode 100644 index 000000000000..fbedd5b227e9 --- /dev/null +++ b/parquet/examples/page_store_dedup/page_store_concept.svg @@ -0,0 +1,220 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Parquet Page Store +Deduplication built into the Arrow Rust Parquet writer using Content-Defined Chunking + + +INPUT FILES +PAGE STORE + +pages/ + + + +A +#a3f9a… + + +B +#b3f9a… + + +C +#c3f9a… + + +D +#d3f9a… + + +E +#e3f9a… + + +F +#f3f9a… + + +G +#g3f9a… + + +H +#h3f9a… + + +I +#i3f9a… + + +J +#j3f9a… + + +K +#k3f9a… + + +L +#l3f9a… + +.rw-r--r-- 158k a3f9b2e1c04d7f28….page +.rw-r--r-- 201k ff22e9640578db3c….page +.rw-r--r-- 167k bc8cff3ad19f673d….page +.rw-r--r-- 148k 4493f8c9b28705f3….page +.rw-r--r-- 160k 3fb950efa4891422….page + + +shared by all + +filter boundary + +new column + +new rows + + +original.parquet +baseline · 996k rows + +A + +B + +C + +D + +E + +F + + +baseline + + +filtered.parquet +keep num_turns < 3 + +A + +G + +C + +D + +E + +H + + +92% reused + + +augmented.parquet +add num_turns column + +A + +I + +B + +C + +J + +D + +E + +F + + +98% reused + + +appended.parquet +append 5 000 rows + +A + +B + +C + +D + +E + +F + +K + 
+L + + +98% reused + +STORAGE COMPARISON +Vanilla Parquet + +2,796 MB (4 independent files) +Page Store via CDC + +865 MB — 69% less · 3.2× ratio + \ No newline at end of file diff --git a/parquet/examples/page_store_dedup/pipeline.py b/parquet/examples/page_store_dedup/pipeline.py new file mode 100644 index 000000000000..a49eb0659139 --- /dev/null +++ b/parquet/examples/page_store_dedup/pipeline.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +Full pipeline for the Parquet Page Store deduplication demo. + +Steps: + 1. Prepare – download dataset and produce 4 Parquet variant files. + 2. Build – compile the parquet-page-store CLI binary. + 3. Ingest – write all variants into a shared content-addressed page store. + 4. Stats – compute and display deduplication statistics. + +Usage: + python pipeline.py [--file PATH] [--skip-prepare] [--skip-build] [--skip-ingest] + +Options: + --file PATH Use a local Parquet file instead of downloading from HuggingFace + --skip-prepare Skip data preparation (variants must already exist in data/) + --skip-build Skip cargo build (binary must already exist) + --skip-ingest Skip page store ingest (pages must already exist in pages/) +""" + +import argparse +import os +import shutil +import subprocess +import sys + +# Ensure imports from the same directory work regardless of cwd +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +import pyarrow as pa +import pyarrow.compute as pc +import pyarrow.parquet as pq + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.path.join(SCRIPT_DIR, "data") +META_DIR = os.path.join(SCRIPT_DIR, "meta") +PAGES_DIR = os.path.join(SCRIPT_DIR, "pages") +CACHE_DIR = os.path.join(SCRIPT_DIR, ".cache") + +# Repo root is 3 levels up: page_store_dedup/ -> examples/ -> parquet/ -> arrow-rs/ +REPO_ROOT = 
os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..")) +BINARY = os.path.join(REPO_ROOT, "target", "release", "parquet-page-store") + +HF_REPO_ID = "kszucs/pq" +HF_FILENAME = "hermes-2.5-cdc-short.parquet" + +# Number of rows to reserve for the appended variant +APPEND_ROWS = 5_000 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _fmt(n: int) -> str: + return f"{n:,}" + + +def _mb(path: str) -> str: + return f"{os.path.getsize(path) / 1e6:.1f} MB" + + +def _dir_size(directory: str, ext: str) -> int: + total = 0 + for entry in os.scandir(directory): + if entry.is_file() and entry.name.endswith(ext): + total += entry.stat().st_size + return total + + +# --------------------------------------------------------------------------- +# Step 1 – Prepare +# --------------------------------------------------------------------------- + + +def load_raw(path: str | None) -> tuple[pa.Table, pa.Table]: + if path is None: + try: + from huggingface_hub import hf_hub_download + except ImportError: + sys.exit("ERROR: huggingface_hub is required. Install with: pip install huggingface_hub") + + print(f" Downloading {HF_FILENAME} from HuggingFace ... 
", end="", flush=True) + path = hf_hub_download( + repo_id=HF_REPO_ID, + filename=HF_FILENAME, + repo_type="dataset", + cache_dir=CACHE_DIR, + ) + print(f"done ({_mb(path)})") + + print(f" Reading {os.path.basename(path)} ...") + full = pq.read_table(path) + print(f" Full table: {_fmt(len(full))} rows, {len(full.schema)} columns") + + if len(full) <= APPEND_ROWS: + sys.exit(f"ERROR: dataset has only {_fmt(len(full))} rows, need more than {_fmt(APPEND_ROWS)}") + + base = full.slice(0, len(full) - APPEND_ROWS) + extra = full.slice(len(full) - APPEND_ROWS, APPEND_ROWS) + return base, extra + + +def step_prepare(file_path: str | None) -> None: + print("=" * 60) + print(" Step 1 – Prepare dataset variants") + print("=" * 60) + + os.makedirs(DATA_DIR, exist_ok=True) + base, extra = load_raw(file_path) + print() + + total_bytes = 0 + print(f" Writing 4 variants to {DATA_DIR}/") + print() + + out = os.path.join(DATA_DIR, "original.parquet") + pq.write_table(base, out) + sz = os.path.getsize(out) + total_bytes += sz + print(f" original.parquet {_fmt(len(base)):>9} rows {sz / 1e6:>6.1f} MB (baseline)") + + out = os.path.join(DATA_DIR, "filtered.parquet") + mask = pc.less(pc.list_value_length(base["conversations"]), 3) + filtered = base.filter(mask) + pq.write_table(filtered, out) + sz = os.path.getsize(out) + total_bytes += sz + pct = len(filtered) * 100 // len(base) + print(f" filtered.parquet {_fmt(len(filtered)):>9} rows {sz / 1e6:>6.1f} MB ({pct}% of original rows kept)") + + out = os.path.join(DATA_DIR, "augmented.parquet") + num_turns = pc.list_value_length(base["conversations"]).cast(pa.int32()) + augmented = base.append_column(pa.field("num_turns", pa.int32()), num_turns) + pq.write_table(augmented, out) + sz = os.path.getsize(out) + total_bytes += sz + print(f" augmented.parquet {_fmt(len(augmented)):>9} rows {sz / 1e6:>6.1f} MB (same rows, +1 column)") + + out = os.path.join(DATA_DIR, "appended.parquet") + appended = pa.concat_tables([base, extra]) + 
pq.write_table(appended, out) + sz = os.path.getsize(out) + total_bytes += sz + print(f" appended.parquet {_fmt(len(appended)):>9} rows {sz / 1e6:>6.1f} MB (+{_fmt(APPEND_ROWS)} rows appended)") + + print() + print(f" Total (4 independent files): {total_bytes / 1e6:.1f} MB") + print() + + +# --------------------------------------------------------------------------- +# Step 2 – Build +# --------------------------------------------------------------------------- + + +def step_build() -> None: + print("=" * 60) + print(" Step 2 – Build parquet-page-store binary") + print("=" * 60) + print() + + cmd = ["cargo", "build", "--release", "-p", "parquet", "--features", "page_store,cli"] + print(f" Running: {' '.join(cmd)}") + print() + + result = subprocess.run(cmd, cwd=REPO_ROOT) + if result.returncode != 0: + sys.exit(f"ERROR: cargo build failed (exit code {result.returncode})") + + print() + print(f" Binary: {BINARY}") + print() + + +# --------------------------------------------------------------------------- +# Step 3 – Ingest into page store +# --------------------------------------------------------------------------- + + +def step_ingest() -> None: + print("=" * 60) + print(" Step 3 – Ingest Parquet files into page store") + print("=" * 60) + print() + + if not os.path.isfile(BINARY): + sys.exit(f"ERROR: binary not found at {BINARY}\n Run without --skip-build first.") + + for d in (PAGES_DIR, META_DIR): + if os.path.isdir(d): + shutil.rmtree(d) + os.makedirs(d) + + inputs = sorted( + os.path.join(DATA_DIR, f) + for f in os.listdir(DATA_DIR) + if f.endswith(".parquet") and os.path.isfile(os.path.join(DATA_DIR, f)) + ) + if not inputs: + sys.exit(f"ERROR: no .parquet files found in {DATA_DIR}") + + cmd = [BINARY, "write"] + inputs + ["--store", PAGES_DIR, "--output", META_DIR, "--compression", "snappy"] + print(f" Running: parquet-page-store write <{len(inputs)} files> --store pages --output meta --compression snappy") + print() + + result = subprocess.run(cmd, 
cwd=SCRIPT_DIR) + if result.returncode != 0: + sys.exit(f"ERROR: parquet-page-store write failed (exit code {result.returncode})") + + print() + + +# --------------------------------------------------------------------------- +# Step 4 – Statistics +# --------------------------------------------------------------------------- + + +def step_stats() -> tuple[float, float]: + print("=" * 60) + print(" Step 4 – Deduplication statistics") + print("=" * 60) + print() + + # Input file sizes (top-level .parquet files only) + input_files = sorted( + os.path.join(DATA_DIR, f) + for f in os.listdir(DATA_DIR) + if f.endswith(".parquet") and os.path.isfile(os.path.join(DATA_DIR, f)) + ) + if not input_files: + print(" No input files found — run without --skip-ingest first.") + return 0.0, 0.0 + + total_input = sum(os.path.getsize(p) for p in input_files) + + print(" Input files:") + for path in input_files: + sz = os.path.getsize(path) + print(f" {os.path.basename(path):<25} {sz / 1e6:>7.1f} MB") + print(f" {'Total':<25} {total_input / 1e6:>7.1f} MB") + print() + + # Page store size + if not os.path.isdir(PAGES_DIR): + print(" Page store directory not found — run without --skip-ingest first.") + return 0.0, 0.0 + + page_files = [e for e in os.scandir(PAGES_DIR) if e.is_file() and e.name.endswith(".page")] + if not page_files: + print(" No .page files found in pages/ — run without --skip-ingest first.") + return 0.0, 0.0 + + total_pages = _dir_size(PAGES_DIR, ".page") + page_count = len(page_files) + + ratio = total_pages / total_input + savings = 1.0 - ratio + bar_len = 20 + bar = "█" * round(ratio * bar_len) + + print(" Page store:") + print(f" Unique pages: {page_count:>7,}") + print(f" Page store size: {total_pages / 1e6:>7.1f} MB") + print(f" Total input size: {total_input / 1e6:>7.1f} MB") + print(f" Dedup ratio (store/input): {ratio * 100:>6.1f}% {bar}") + print(f" Space savings: {savings * 100:>6.1f}%") + print() + print(" Note: these numbers reflect page-level 
deduplication within the") + print(" page store. Block-level tools (e.g. 'de stats') operate at a") + print(" different granularity and will report lower dedup ratios.") + print() + + return total_input / 1e6, total_pages / 1e6 + + +# --------------------------------------------------------------------------- +# Step 5 – Regenerate concept diagram +# --------------------------------------------------------------------------- + + +def step_concept(total_mb: float, store_mb: float) -> None: + print("=" * 60) + print(" Step 5 – Regenerate concept diagram") + print("=" * 60) + print() + + try: + from concept import generate + except ImportError: + print(" SKIP: drawsvg not installed (pip install drawsvg)") + print() + return + + generate(total_mb=total_mb, store_mb=store_mb) + print() + + +# --------------------------------------------------------------------------- +# Step 6 – Roundtrip verification +# --------------------------------------------------------------------------- + + +def step_verify() -> None: + print("=" * 60) + print(" Step 6 – Roundtrip verification") + print("=" * 60) + print() + + verify_dir = os.path.join(SCRIPT_DIR, "verify") + if os.path.isdir(verify_dir): + shutil.rmtree(verify_dir) + os.makedirs(verify_dir) + + if not os.path.isfile(BINARY): + sys.exit(f"ERROR: binary not found at {BINARY}\n Run without --skip-build first.") + + meta_files = sorted( + os.path.join(META_DIR, f) + for f in os.listdir(META_DIR) + if f.endswith(".meta.parquet") and os.path.isfile(os.path.join(META_DIR, f)) + ) + if not meta_files: + sys.exit(f"ERROR: no .meta.parquet files found in {META_DIR}") + + all_ok = True + for meta_path in meta_files: + stem = os.path.basename(meta_path).replace(".meta.parquet", "") + original_path = os.path.join(DATA_DIR, f"{stem}.parquet") + reconstructed_path = os.path.join(verify_dir, f"{stem}.parquet") + + if not os.path.isfile(original_path): + print(f" SKIP {stem}: original not found in data/") + continue + + cmd = [BINARY, 
"reconstruct", meta_path, "--store", PAGES_DIR, "--output", reconstructed_path] + result = subprocess.run(cmd, capture_output=True) + if result.returncode != 0: + print(f" FAIL {stem}: reconstruction failed") + print(result.stderr.decode()) + all_ok = False + continue + + original = pq.read_table(original_path) + reconstructed = pq.read_table(reconstructed_path) + + if original.equals(reconstructed, check_metadata=False): + print(f" OK {stem}.parquet ({len(original):,} rows, {len(original.schema)} columns)") + else: + print(f" FAIL {stem}: data mismatch") + orig_rows, rec_rows = len(original), len(reconstructed) + if orig_rows != rec_rows: + print(f" row count: original={orig_rows:,} reconstructed={rec_rows:,}") + else: + for col in original.schema.names: + if col not in reconstructed.schema.names: + print(f" missing column in reconstructed: {col}") + elif not original[col].equals(reconstructed[col]): + print(f" column mismatch: {col}") + all_ok = False + + print() + if all_ok: + print(" All roundtrip checks passed.") + else: + sys.exit("ERROR: roundtrip verification failed") + print() + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--file", metavar="PATH", help="Use a local Parquet file instead of downloading") + parser.add_argument("--skip-prepare", action="store_true", help="Skip data preparation step") + parser.add_argument("--skip-build", action="store_true", help="Skip cargo build step") + parser.add_argument("--skip-ingest", action="store_true", help="Skip page store ingest step") + parser.add_argument("--skip-concept", action="store_true", help="Skip concept diagram regeneration") + parser.add_argument("--skip-verify", action="store_true", help="Skip roundtrip verification step") + 
args = parser.parse_args() + + if not args.skip_prepare: + step_prepare(args.file) + + if not args.skip_build: + step_build() + + if not args.skip_ingest: + step_ingest() + + total_mb, store_mb = step_stats() + + if not args.skip_concept: + step_concept(total_mb, store_mb) + + if not args.skip_verify: + step_verify() + + +if __name__ == "__main__": + main() diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index e24788e4bcd7..11007930dfb0 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -182,10 +182,10 @@ experimental!(mod array_reader); pub mod arrow_reader; pub mod arrow_writer; -#[cfg(feature = "page_store")] -pub mod page_store; mod buffer; mod decoder; +#[cfg(feature = "page_store")] +pub mod page_store; #[cfg(feature = "async")] pub mod async_reader; diff --git a/parquet/src/arrow/page_store/mod.rs b/parquet/src/arrow/page_store/mod.rs index 87aa408b7840..aa710a102ff7 100644 --- a/parquet/src/arrow/page_store/mod.rs +++ b/parquet/src/arrow/page_store/mod.rs @@ -67,18 +67,16 @@ mod tests { use std::sync::Arc; use arrow_array::{ - ArrayRef, BooleanArray, Float64Array, Int32Array, ListArray, RecordBatch, - StringArray, StructArray, + ArrayRef, BooleanArray, Float64Array, Int32Array, ListArray, RecordBatch, StringArray, + StructArray, }; use arrow_schema::Field; use super::*; + use crate::arrow::ArrowSchemaConverter; use crate::errors::Result; - use crate::file::metadata::{ - FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataWriter, - }; + use crate::file::metadata::{FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataWriter}; use crate::file::properties::{EnabledStatistics, WriterProperties}; - use crate::arrow::ArrowSchemaConverter; use crate::schema::types::SchemaDescriptor; // ----------------------------------------------------------------------- @@ -109,13 +107,50 @@ mod tests { fn sample_batch() -> RecordBatch { RecordBatch::try_from_iter(vec![ - ("id", Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as 
ArrayRef), - ("value", Arc::new(Float64Array::from(vec![1.0, 2.5, 3.7, 4.2, 5.9])) as ArrayRef), - ("name", Arc::new(StringArray::from(vec!["alice", "bob", "charlie", "diana", "eve"])) as ArrayRef), + ( + "id", + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as ArrayRef, + ), + ( + "value", + Arc::new(Float64Array::from(vec![1.0, 2.5, 3.7, 4.2, 5.9])) as ArrayRef, + ), + ( + "name", + Arc::new(StringArray::from(vec![ + "alice", "bob", "charlie", "diana", "eve", + ])) as ArrayRef, + ), + ]) + .unwrap() + } + + /// A large batch that encodes to well over 256 KiB per column, guaranteeing + /// multiple CDC pages per column with default CDC parameters (min 256 KiB). + /// Uses 100 000 rows of varied (non-compressible) data. + fn large_batch(n: usize) -> RecordBatch { + let ids: Vec = (0..n as i32).collect(); + // Vary the float values so they resist run-length compression + let values: Vec = (0..n).map(|i| (i as f64 * 1.000_001_f64).sin()).collect(); + // 30-byte strings — varied enough to prevent dictionary/RLE collapsing + let names: Vec = (0..n) + .map(|i| format!("row_{:0>10}_pad_{:0>10}", i, i * 7 + 3)) + .collect(); + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(ids)) as ArrayRef), + ("value", Arc::new(Float64Array::from(values)) as ArrayRef), + ("name", Arc::new(StringArray::from(names)) as ArrayRef), ]) .unwrap() } + /// Concatenate all batches into one for equality comparison. 
+ fn concat_batches(batches: &[RecordBatch]) -> RecordBatch { + use arrow_select::concat::concat_batches; + let schema = batches[0].schema(); + concat_batches(&schema, batches).unwrap() + } + // ----------------------------------------------------------------------- // Round-trip tests // ----------------------------------------------------------------------- @@ -145,20 +180,26 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let b1 = RecordBatch::try_from_iter(vec![ - ("x", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), - ]).unwrap(); - let b2 = RecordBatch::try_from_iter(vec![ - ("x", Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef), - ]).unwrap(); - - let metadata = write_batches(&store, &meta, &[b1, b2], None).unwrap(); + let b1 = RecordBatch::try_from_iter(vec![( + "x", + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + )]) + .unwrap(); + let b2 = RecordBatch::try_from_iter(vec![( + "x", + Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef, + )]) + .unwrap(); + + let metadata = write_batches(&store, &meta, &[b1.clone(), b2.clone()], None).unwrap(); assert_eq!(metadata.num_row_groups(), 1); assert_eq!(metadata.file_metadata().num_rows(), 5); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); - let total: usize = batches.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 5); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); + assert_eq!(concat_batches(&batches), concat_batches(&[b1, b2])); } #[test] @@ -179,10 +220,64 @@ mod tests { assert_eq!(metadata.num_row_groups(), 3); assert_eq!(metadata.file_metadata().num_rows(), 15); - let total: usize = PageStoreReader::try_new(&meta, &store) - .unwrap().read_batches().unwrap() - .iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 15); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); + let expected = 
concat_batches(&[batch.clone(), batch.clone(), batch]);
+        assert_eq!(concat_batches(&batches), expected);
+    }
+
+    #[test]
+    fn test_multipage_roundtrip() {
+        // 1 000 000 rows encodes to several MiB per column, well above the 256 KiB
+        // CDC minimum, so every column gets multiple pages. Assert that the
+        // reconstructed data is bit-for-bit identical to the input.
+        let tmp = tempfile::tempdir().unwrap();
+        let store = tmp.path().join("pages");
+        let meta = tmp.path().join("data.parquet");
+
+        let batch = large_batch(1_000_000);
+        write_batches(&store, &meta, &[batch.clone()], None).unwrap();
+
+        // Must have produced more than one page per column
+        assert!(count_page_files(&store) > batch.num_columns());
+
+        let batches = PageStoreReader::try_new(&meta, &store)
+            .unwrap()
+            .read_batches()
+            .unwrap();
+        assert_eq!(concat_batches(&batches), batch);
+    }
+
+    #[test]
+    fn test_multipage_multiple_row_groups_roundtrip() {
+        // Three row groups, each large enough for multiple pages per column. 
+ let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let b1 = large_batch(500_000); + let b2 = large_batch(500_000); + let b3 = large_batch(500_000); + + let mut writer = PageStoreWriter::try_new(&store, b1.schema(), None).unwrap(); + writer.write(&b1).unwrap(); + writer.flush().unwrap(); + writer.write(&b2).unwrap(); + writer.flush().unwrap(); + writer.write(&b3).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + assert_eq!(metadata.num_row_groups(), 3); + assert_eq!(metadata.file_metadata().num_rows(), 1_500_000); + + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); + let expected = concat_batches(&[b1, b2, b3]); + assert_eq!(concat_batches(&batches), expected); } #[test] @@ -213,12 +308,34 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = RecordBatch::try_from_iter(vec![ - ("id", Arc::new(Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)])) as ArrayRef), - ("label", Arc::new(StringArray::from(vec![Some("a"), Some("b"), None, None, Some("e")])) as ArrayRef), - ]).unwrap(); + ( + "id", + Arc::new(Int32Array::from(vec![ + Some(1), + None, + Some(3), + None, + Some(5), + ])) as ArrayRef, + ), + ( + "label", + Arc::new(StringArray::from(vec![ + Some("a"), + Some("b"), + None, + None, + Some("e"), + ])) as ArrayRef, + ), + ]) + .unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -228,12 +345,17 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let batch = RecordBatch::try_from_iter(vec![ - ("flag", Arc::new(BooleanArray::from(vec![true, false, true, true, false])) as ArrayRef), - ]).unwrap(); + let batch = 
RecordBatch::try_from_iter(vec![( + "flag", + Arc::new(BooleanArray::from(vec![true, false, true, true, false])) as ArrayRef, + )]) + .unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -244,17 +366,23 @@ mod tests { let meta = tmp.path().join("data.parquet"); let struct_array = StructArray::from(vec![ - (Arc::new(Field::new("a", arrow_schema::DataType::Int32, false)), - Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), - (Arc::new(Field::new("b", arrow_schema::DataType::Utf8, false)), - Arc::new(StringArray::from(vec!["x", "y", "z"])) as ArrayRef), + ( + Arc::new(Field::new("a", arrow_schema::DataType::Int32, false)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("b", arrow_schema::DataType::Utf8, false)), + Arc::new(StringArray::from(vec!["x", "y", "z"])) as ArrayRef, + ), ]); - let batch = RecordBatch::try_from_iter(vec![ - ("s", Arc::new(struct_array) as ArrayRef), - ]).unwrap(); + let batch = + RecordBatch::try_from_iter(vec![("s", Arc::new(struct_array) as ArrayRef)]).unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -272,12 +400,14 @@ mod tests { Arc::new(values), None, ); - let batch = RecordBatch::try_from_iter(vec![ - ("items", Arc::new(list) as ArrayRef), - ]).unwrap(); + let batch = + RecordBatch::try_from_iter(vec![("items", Arc::new(list) as ArrayRef)]).unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let 
batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -295,8 +425,12 @@ mod tests { write_batches(&store, &meta, &[batch.clone()], None).unwrap(); let total: usize = PageStoreReader::try_new(&meta, &store) - .unwrap().read_batches().unwrap() - .iter().map(|b| b.num_rows()).sum(); + .unwrap() + .read_batches() + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum(); assert_eq!(total, 5); } @@ -312,7 +446,10 @@ mod tests { .build(); write_batches(&store, &meta, &[batch.clone()], Some(props)).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -348,8 +485,9 @@ mod tests { let unique: std::collections::HashSet<_> = manifest.pages.iter().map(|p| &p.hash).collect(); assert_eq!(count_page_files(&store), unique.len()); - let total: usize = reader.read_batches().unwrap().iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 10); + let batches = reader.read_batches().unwrap(); + let expected = concat_batches(&[batch.clone(), batch]); + assert_eq!(concat_batches(&batches), expected); } #[test] @@ -369,8 +507,14 @@ mod tests { assert_eq!(pages_after_first, pages_after_second); - let batches_a = PageStoreReader::try_new(&meta_a, &store).unwrap().read_batches().unwrap(); - let batches_b = PageStoreReader::try_new(&meta_b, &store).unwrap().read_batches().unwrap(); + let batches_a = PageStoreReader::try_new(&meta_a, &store) + .unwrap() + .read_batches() + .unwrap(); + let batches_b = PageStoreReader::try_new(&meta_b, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches_a, batches_b); assert_eq!(batches_a[0], batch); } @@ -389,7 +533,10 @@ mod tests { let batch = sample_batch(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, 
&store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -428,11 +575,14 @@ mod tests { } assert!(manifest.pages.iter().all(|p| p.row_group == 0)); - let columns: std::collections::HashSet<_> = manifest.pages.iter().map(|p| p.column).collect(); + let columns: std::collections::HashSet<_> = + manifest.pages.iter().map(|p| p.column).collect(); assert_eq!(columns.len(), metadata.row_groups()[0].num_columns()); for col in &columns { - let mut idxs: Vec<_> = manifest.pages.iter() + let mut idxs: Vec<_> = manifest + .pages + .iter() .filter(|p| p.column == *col) .map(|p| p.page_index) .collect(); @@ -454,7 +604,10 @@ mod tests { let batch = sample_batch(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let schema = PageStoreReader::try_new(&meta, &store).unwrap().schema().unwrap(); + let schema = PageStoreReader::try_new(&meta, &store) + .unwrap() + .schema() + .unwrap(); assert_eq!(schema.fields(), batch.schema().fields()); } @@ -490,13 +643,19 @@ mod tests { write_batches(&store, &meta, &[sample_batch()], None).unwrap(); - let first_page = fs::read_dir(&store).unwrap() + let first_page = fs::read_dir(&store) + .unwrap() .filter_map(|e| e.ok()) .find(|e| e.path().extension().map_or(false, |ext| ext == "page")) .unwrap(); fs::remove_file(first_page.path()).unwrap(); - assert!(PageStoreReader::try_new(&meta, &store).unwrap().read_batches().is_err()); + assert!( + PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .is_err() + ); } #[test] @@ -505,18 +664,30 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema = ArrowSchemaConverter::new() + .convert(&sample_batch().schema()) + .unwrap(); let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); let 
file_metadata = FileMetaData::new( - 2, 0, None, - Some(vec![KeyValue::new(MANIFEST_KEY.to_string(), "not json{{{".to_string())]), - schema_descr, None, + 2, + 0, + None, + Some(vec![KeyValue::new( + MANIFEST_KEY.to_string(), + "not json{{{".to_string(), + )]), + schema_descr, + None, ); fs::create_dir_all(&store).unwrap(); let file = fs::File::create(&meta).unwrap(); - ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])) + .finish() + .unwrap(); - let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); + let err = PageStoreReader::try_new(&meta, &store) + .unwrap_err() + .to_string(); assert!(err.contains("expected"), "unexpected error: {err}"); } @@ -526,14 +697,23 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema = ArrowSchemaConverter::new() + .convert(&sample_batch().schema()) + .unwrap(); let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); let file_metadata = FileMetaData::new(2, 0, None, None, schema_descr, None); fs::create_dir_all(&store).unwrap(); let file = fs::File::create(&meta).unwrap(); - ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])) + .finish() + .unwrap(); - let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); - assert!(err.contains(MANIFEST_KEY), "error should mention key: {err}"); + let err = PageStoreReader::try_new(&meta, &store) + .unwrap_err() + .to_string(); + assert!( + err.contains(MANIFEST_KEY), + "error should mention key: {err}" + ); } } diff --git a/parquet/src/arrow/page_store/reader.rs b/parquet/src/arrow/page_store/reader.rs index 9712bf0e2c03..f5e02a70e76a 100644 
--- a/parquet/src/arrow/page_store/reader.rs +++ b/parquet/src/arrow/page_store/reader.rs @@ -28,7 +28,7 @@ use bytes::Bytes; use arrow_array::RecordBatch; use arrow_schema::{ArrowError, SchemaRef}; -use super::{PageStoreManifest, MANIFEST_KEY}; +use super::{MANIFEST_KEY, PageStoreManifest}; use crate::arrow::arrow_reader::{ ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, }; @@ -74,10 +74,7 @@ impl PageStoreReader { /// /// * `metadata_path` — path to the metadata-only `.parquet` file. /// * `store_dir` — directory containing `{hash}.page` blobs. - pub fn try_new( - metadata_path: impl AsRef, - store_dir: impl Into, - ) -> Result { + pub fn try_new(metadata_path: impl AsRef, store_dir: impl Into) -> Result { let store_dir = store_dir.into(); let file = fs::File::open(metadata_path.as_ref())?; @@ -119,8 +116,7 @@ impl PageStoreReader { /// decoded on-demand and only one batch is held in memory at a time. pub fn reader(&self) -> Result { let chunk_reader = PageStoreChunkReader::new(self.store_dir.clone(), &self.manifest); - let options = - ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required); + let options = ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required); let arrow_metadata = ArrowReaderMetadata::try_new(Arc::clone(&self.metadata), options)?; ParquetRecordBatchReaderBuilder::new_with_metadata(chunk_reader, arrow_metadata).build() } @@ -150,8 +146,7 @@ impl PageStoreReader { crate::errors::ParquetError::General(format!("'{MANIFEST_KEY}' has no value")) })?; - serde_json::from_str(value) - .map_err(|e| crate::errors::ParquetError::General(e.to_string())) + serde_json::from_str(value).map_err(|e| crate::errors::ParquetError::General(e.to_string())) } } diff --git a/parquet/src/arrow/page_store/writer.rs b/parquet/src/arrow/page_store/writer.rs index 0c5377d3741c..87d2b4914ef9 100644 --- a/parquet/src/arrow/page_store/writer.rs +++ b/parquet/src/arrow/page_store/writer.rs @@ -27,12 +27,12 @@ 
use bytes::Bytes; use arrow_array::RecordBatch; use arrow_schema::{DataType as ArrowDataType, SchemaRef}; -use super::{PageRef, PageStoreManifest, MANIFEST_KEY}; +use super::{MANIFEST_KEY, PageRef, PageStoreManifest}; +use crate::arrow::ArrowSchemaConverter; use crate::arrow::arrow_writer::{ ArrowColumnChunk, ArrowColumnChunkData, ArrowColumnWriterImpl, ArrowRowGroupWriter, SharedColumnChunk, }; -use crate::arrow::ArrowSchemaConverter; use crate::column::chunker::ContentDefinedChunker; use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; use crate::column::writer::{GenericColumnWriter, get_column_writer}; @@ -43,7 +43,9 @@ use crate::file::metadata::{ }; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::offset_index::OffsetIndexMetaData; -use crate::file::properties::{CdcOptions, EnabledStatistics, WriterProperties, WriterPropertiesPtr}; +use crate::file::properties::{ + CdcOptions, EnabledStatistics, WriterProperties, WriterPropertiesPtr, +}; use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, SchemaDescriptor}; @@ -186,11 +188,7 @@ fn make_column_writer( let chunk: SharedColumnChunk = pw.buffer.clone(); let writer = if use_byte_array { - ArrowColumnWriterImpl::ByteArray(GenericColumnWriter::new( - desc.clone(), - props.clone(), - pw, - )) + ArrowColumnWriterImpl::ByteArray(GenericColumnWriter::new(desc.clone(), props.clone(), pw)) } else { ArrowColumnWriterImpl::Column(get_column_writer(desc.clone(), props.clone(), pw)) }; @@ -238,17 +236,53 @@ fn create_writers_for_type( | ArrowDataType::FixedSizeList(f, _) | ArrowDataType::ListView(f) | ArrowDataType::LargeListView(f) => { - create_writers_for_type(f.data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; + create_writers_for_type( + f.data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; } 
ArrowDataType::Struct(fields) => { for field in fields { - create_writers_for_type(field.data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; + create_writers_for_type( + field.data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; } } ArrowDataType::Map(f, _) => match f.data_type() { ArrowDataType::Struct(f) => { - create_writers_for_type(f[0].data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; - create_writers_for_type(f[1].data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; + create_writers_for_type( + f[0].data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; + create_writers_for_type( + f[1].data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; } _ => unreachable!("invalid map type"), }, diff --git a/parquet/src/bin/parquet-page-store.rs b/parquet/src/bin/parquet-page-store.rs index 47f9e4be1f75..6b8d0feb5762 100644 --- a/parquet/src/bin/parquet-page-store.rs +++ b/parquet/src/bin/parquet-page-store.rs @@ -41,9 +41,10 @@ use std::path::PathBuf; use arrow_array::RecordBatchReader; use clap::{Parser, Subcommand, ValueEnum}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::arrow::arrow_writer::ArrowWriter; use parquet::arrow::page_store::{PageStoreReader, PageStoreWriter}; use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; -use parquet::errors::Result; +use parquet::errors::{ParquetError, Result}; use parquet::file::properties::WriterProperties; #[derive(Debug, Parser)] @@ -56,13 +57,16 @@ use parquet::file::properties::WriterProperties; /// /// The workflow has two steps: /// -/// 1. `write` — reads regular Parquet files, re-encodes their pages with CDC +/// 1. 
`write` — reads regular Parquet files, re-encodes their pages with CDC /// chunking, writes each page as a {hash}.page blob into a shared store /// directory, and produces a lightweight metadata-only Parquet file. /// -/// 2. `read` — given a metadata Parquet file and the store directory, +/// 2. `read` — given a metadata Parquet file and the store directory, /// reassembles the data and prints it. /// +/// 3. `reconstruct` — given a metadata Parquet file and the store directory, +/// writes a self-contained regular Parquet file (no page store dependency). +/// /// Quick start: /// /// # Write a file into the store @@ -71,6 +75,9 @@ use parquet::file::properties::WriterProperties; /// # Read it back /// parquet-page-store read ./meta/data.meta.parquet --store ./pages /// +/// # Reconstruct a self-contained Parquet file from the page store +/// parquet-page-store reconstruct ./meta/data.meta.parquet --store ./pages --output data.parquet +/// /// # Write several files (pages are deduplicated across them) /// parquet-page-store write a.parquet b.parquet --store ./pages struct Cli { @@ -137,6 +144,32 @@ enum Command { #[clap(short, long)] store: PathBuf, }, + + /// Reconstruct a self-contained Parquet file from a page-store-backed one. + /// + /// Reads all data from the page store via the metadata file and writes a + /// regular Parquet file that has no dependency on the store directory. + /// Useful for exporting, verification, or migrating data out of the store. + /// + /// Example: + /// + /// parquet-page-store reconstruct data.meta.parquet --store ./pages --output data.parquet + Reconstruct { + /// Path to the metadata-only Parquet file. + input: PathBuf, + + /// Page store directory containing the .page blobs. + #[clap(short, long)] + store: PathBuf, + + /// Output path for the reconstructed regular Parquet file. + #[clap(short, long)] + output: PathBuf, + + /// Compression codec for the output file [default: snappy]. 
+ #[clap(long, default_value = "snappy")] + compression: CompressionArg, + }, } #[derive(Debug, Clone, ValueEnum)] @@ -176,6 +209,12 @@ fn main() { compression, } => cmd_write(&inputs, &store, output.as_deref(), compression), Command::Read { input, store } => cmd_read(&input, &store), + Command::Reconstruct { + input, + store, + output, + compression, + } => cmd_reconstruct(&input, &store, &output, compression), }; if let Err(e) = result { eprintln!("Error: {e}"); @@ -183,6 +222,35 @@ fn main() { } } +/// Expand any glob patterns in `inputs` into concrete file paths. +/// +/// Patterns containing `*` or `?` are expanded using the `glob` crate. +/// Literal paths (no wildcards) are passed through unchanged. +/// This lets you write `parquet-page-store write "data/*.parquet"` on any +/// platform without relying on shell glob expansion. +fn expand_inputs(inputs: &[PathBuf]) -> Result> { + let mut expanded = Vec::new(); + for input in inputs { + let s = input.to_string_lossy(); + if s.contains('*') || s.contains('?') { + let mut matches: Vec = glob::glob(&s) + .map_err(|e| ParquetError::General(format!("invalid glob pattern: {e}")))? + .map(|entry| entry.map_err(|e| ParquetError::General(format!("glob error: {e}")))) + .collect::>()?; + if matches.is_empty() { + return Err(ParquetError::General(format!( + "glob pattern matched no files: {s}" + ))); + } + matches.sort(); + expanded.extend(matches); + } else { + expanded.push(input.clone()); + } + } + Ok(expanded) +} + fn cmd_write( inputs: &[PathBuf], store: &PathBuf, @@ -192,7 +260,9 @@ fn cmd_write( let output_dir = output_dir.unwrap_or_else(|| std::path::Path::new(".")); std::fs::create_dir_all(output_dir)?; - for input in inputs { + let inputs = expand_inputs(inputs)?; + + for input in &inputs { let file = File::open(input)?; let reader = ParquetRecordBatchReaderBuilder::try_new(file)? 
.with_batch_size(8192) @@ -211,8 +281,7 @@ fn cmd_write( let mut writer = PageStoreWriter::try_new(store, schema, Some(props))?; let mut total_rows = 0usize; for batch in reader { - let batch = - batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; + let batch = batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; total_rows += batch.num_rows(); writer.write(&batch)?; } @@ -256,6 +325,42 @@ fn cmd_write( Ok(()) } +fn cmd_reconstruct( + input: &PathBuf, + store: &PathBuf, + output: &PathBuf, + compression: CompressionArg, +) -> Result<()> { + let reader = PageStoreReader::try_new(input, store)?; + let schema = reader + .schema() + .map_err(|e| ParquetError::General(e.to_string()))?; + + let props = WriterProperties::builder() + .set_compression(compression.to_parquet()) + .build(); + let file = File::create(output)?; + let mut writer = ArrowWriter::try_new(file, schema, Some(props))?; + + let mut total_rows = 0usize; + for batch in reader.reader()? 
{ + let batch = batch.map_err(|e| ParquetError::General(e.to_string()))?; + total_rows += batch.num_rows(); + writer.write(&batch)?; + } + let metadata = writer.close()?; + + eprintln!( + "{}: {} row(s), {} row group(s) -> {}", + input.display(), + total_rows, + metadata.num_row_groups(), + output.display(), + ); + + Ok(()) +} + fn cmd_read(input: &PathBuf, store: &PathBuf) -> Result<()> { let reader = PageStoreReader::try_new(input, store)?; let md = reader.metadata(); From 2e9ee369f8898bd2448c86bf598f44708a2a3a3d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 19:27:26 +0200 Subject: [PATCH 5/8] chore: add ASF license headers and fix RAT/Prettier CI failures --- dev/release/rat_exclude_files.txt | 1 + parquet/examples/page_store_dedup/README.md | 90 +++++++++++-------- parquet/examples/page_store_dedup/concept.py | 16 ++++ parquet/examples/page_store_dedup/pipeline.py | 16 ++++ 4 files changed, 88 insertions(+), 35 deletions(-) diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index d08a0ea8c74a..3b2dd0051a1a 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -20,3 +20,4 @@ arrow-flight/src/sql/arrow.flight.protocol.sql.rs .github/* parquet/src/bin/parquet-fromcsv-help.txt arrow-flight/examples/data/* +parquet/examples/page_store_dedup/page_store_concept.svg diff --git a/parquet/examples/page_store_dedup/README.md b/parquet/examples/page_store_dedup/README.md index 63ffa70f2ceb..41d34f7b99c7 100644 --- a/parquet/examples/page_store_dedup/README.md +++ b/parquet/examples/page_store_dedup/README.md @@ -1,29 +1,48 @@ + + # Parquet Page Store — Deduplication Demo > **Prototype**: This is an experimental feature exploring content-defined -> chunking for Parquet. APIs and file formats may change. +> chunking for Parquet. APIs and file formats may change. 
Demonstrates how Content-Defined Chunking (CDC) enables efficient deduplication across multiple versions of a dataset using the Parquet page store writer in -Apache Arrow Rust. The deduplication is self-contained in the Parquet writer — +Apache Arrow Rust. The deduplication is self-contained in the Parquet writer — no special storage system is required. ## What this demo shows Four common dataset operations are applied to a real-world dataset ([OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5) -conversational data, ~800 MB per file). Each operation produces a separate -Parquet file. Without a page store, storing all four files costs the full sum -of their sizes. With the CDC page store, identical pages are stored **exactly +conversational data, ~800 MB per file). Each operation produces a separate +Parquet file. Without a page store, storing all four files costs the full sum +of their sizes. With the CDC page store, identical pages are stored **exactly once** — indexed by their BLAKE3 hash — so the four files share most of their -bytes. The resulting files can be stored anywhere. +bytes. The resulting files can be stored anywhere. 
-| File | Operation | -|------|-----------| -| `original.parquet` | Baseline dataset (~996k rows) | -| `filtered.parquet` | Keep rows where `num_turns ≤ 3` | +| File | Operation | +| ------------------- | -------------------------------------- | +| `original.parquet` | Baseline dataset (~996k rows) | +| `filtered.parquet` | Keep rows where `num_turns ≤ 3` | | `augmented.parquet` | Original + computed column `num_turns` | -| `appended.parquet` | Original + 5 000 new rows appended | +| `appended.parquet` | Original + 5 000 new rows appended | ## Prerequisites @@ -52,6 +71,7 @@ python pipeline.py --skip-prepare --skip-build --skip-ingest # stats only ``` Outputs: + - `page_store_concept.png` — architectural overview of how shared pages work - `page_store_savings.png` — side-by-side storage comparison with real numbers @@ -62,7 +82,7 @@ python pipeline.py --file /path/to/your.parquet ``` The script requires a `conversations` list column for the filtered and augmented -variants. Adapt `pipeline.py` to your own schema as needed. +variants. Adapt `pipeline.py` to your own schema as needed. 
## Results @@ -70,34 +90,34 @@ Dataset: **OpenHermes-2.5** (short conversations, `num_turns < 10`) ### Dataset variants -| File | Operation | Rows | Size | -|------|-----------|------|------| -| `original.parquet` | Baseline | 996,009 | 782.1 MB | -| `filtered.parquet` | Keep `num_turns ≤ 3` (removes 0.2% of rows) | 993,862 | 776.8 MB | -| `augmented.parquet` | Add column `num_turns` | 996,009 | 782.2 MB | -| `appended.parquet` | Append 5,000 rows | 1,001,009 | 788.6 MB | -| **Total** | | | **3,129.7 MB** | +| File | Operation | Rows | Size | +| ------------------- | ------------------------------------------- | --------- | -------------- | +| `original.parquet` | Baseline | 996,009 | 782.1 MB | +| `filtered.parquet` | Keep `num_turns ≤ 3` (removes 0.2% of rows) | 993,862 | 776.8 MB | +| `augmented.parquet` | Add column `num_turns` | 996,009 | 782.2 MB | +| `appended.parquet` | Append 5,000 rows | 1,001,009 | 788.6 MB | +| **Total** | | | **3,129.7 MB** | ### Page store results -| Metric | Value | -|--------|-------| -| Unique pages stored | 3,400 | -| Total page references | 15,179 | -| Page store size | 559.0 MB | -| Metadata files size | 4.4 MB | -| **Page store + metadata** | **563.4 MB** | -| **Storage saved** | **2,566.3 MB (82%)** | -| **Deduplication ratio** | **5.6×** | +| Metric | Value | +| ------------------------- | -------------------- | +| Unique pages stored | 3,400 | +| Total page references | 15,179 | +| Page store size | 559.0 MB | +| Metadata files size | 4.4 MB | +| **Page store + metadata** | **563.4 MB** | +| **Storage saved** | **2,566.3 MB (82%)** | +| **Deduplication ratio** | **5.6×** | ### Per-file page breakdown -| File | Page refs | Unique hashes | New pages | Reused pages | -|------|-----------|---------------|-----------|--------------| -| `original.parquet` | 3,782 | 3,100 | 3,100 | 0 | -| `filtered.parquet` | 3,755 | 3,075 | 222 | 2,853 (92%) | -| `augmented.parquet` | 3,834 | 3,136 | 36 | 3,100 (98%) | -| `appended.parquet` | 
3,808 | 3,125 | 42 | 3,083 (98%) | +| File | Page refs | Unique hashes | New pages | Reused pages | +| ------------------- | --------- | ------------- | --------- | ------------ | +| `original.parquet` | 3,782 | 3,100 | 3,100 | 0 | +| `filtered.parquet` | 3,755 | 3,075 | 222 | 2,853 (92%) | +| `augmented.parquet` | 3,834 | 3,136 | 36 | 3,100 (98%) | +| `appended.parquet` | 3,808 | 3,125 | 42 | 3,083 (98%) | ### Key insights @@ -110,7 +130,7 @@ Dataset: **OpenHermes-2.5** (short conversations, `num_turns < 10`) 3. **Filtering rows** (`filtered`): 92% of pages reused despite row removal. Removing just 0.2% of rows barely shifts CDC boundaries — most pages are - unchanged. Heavier filtering (removing 20–50% of rows) would produce more new + unchanged. Heavier filtering (removing 20–50% of rows) would produce more new pages, as CDC boundaries shift further throughout the file. 4. **Net result**: 4 dataset versions stored for **563 MB instead of 3.1 GB** — an diff --git a/parquet/examples/page_store_dedup/concept.py b/parquet/examples/page_store_dedup/concept.py index cdd30789145e..01523f664465 100644 --- a/parquet/examples/page_store_dedup/concept.py +++ b/parquet/examples/page_store_dedup/concept.py @@ -1,4 +1,20 @@ #!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. """ Generate the Parquet Page Store concept diagram. diff --git a/parquet/examples/page_store_dedup/pipeline.py b/parquet/examples/page_store_dedup/pipeline.py index a49eb0659139..6e4db2b6a2be 100644 --- a/parquet/examples/page_store_dedup/pipeline.py +++ b/parquet/examples/page_store_dedup/pipeline.py @@ -1,4 +1,20 @@ #!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. """ Full pipeline for the Parquet Page Store deduplication demo. 
From 735970f1cdf3937ec272100806560fd06fc1b7ba Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 19:57:05 +0200 Subject: [PATCH 6/8] fix(parquet): fix clippy warnings in page_store and cdc --- parquet/src/arrow/page_store/mod.rs | 30 +++++++++++++------------- parquet/src/arrow/page_store/writer.rs | 5 ++--- parquet/src/column/chunker/cdc.rs | 2 +- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/parquet/src/arrow/page_store/mod.rs b/parquet/src/arrow/page_store/mod.rs index aa710a102ff7..4aae932bbe35 100644 --- a/parquet/src/arrow/page_store/mod.rs +++ b/parquet/src/arrow/page_store/mod.rs @@ -87,7 +87,7 @@ mod tests { fs::read_dir(dir) .unwrap() .filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "page")) .count() } @@ -162,7 +162,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = sample_batch(); - let metadata = write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let metadata = write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); assert_eq!(metadata.num_row_groups(), 1); assert_eq!(metadata.file_metadata().num_rows(), 5); @@ -238,7 +238,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = large_batch(1_000_000); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); // Must have produced more than one page per column assert!(count_page_files(&store) > batch.num_columns()); @@ -331,7 +331,7 @@ mod tests { ]) .unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -351,7 +351,7 @@ mod tests { )]) .unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + 
write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -378,7 +378,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("s", Arc::new(struct_array) as ArrayRef)]).unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -403,7 +403,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("items", Arc::new(list) as ArrayRef)]).unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -422,7 +422,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = sample_batch(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let total: usize = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -444,7 +444,7 @@ mod tests { let props = WriterProperties::builder() .set_statistics_enabled(EnabledStatistics::Page) .build(); - write_batches(&store, &meta, &[batch.clone()], Some(props)).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), Some(props)).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -499,10 +499,10 @@ mod tests { let batch = sample_batch(); - write_batches(&store, &meta_a, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta_a, std::slice::from_ref(&batch), None).unwrap(); let pages_after_first = count_page_files(&store); - write_batches(&store, &meta_b, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta_b, std::slice::from_ref(&batch), None).unwrap(); let pages_after_second = count_page_files(&store); 
assert_eq!(pages_after_first, pages_after_second); @@ -531,7 +531,7 @@ mod tests { fs::create_dir_all(meta.parent().unwrap()).unwrap(); let batch = sample_batch(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -551,7 +551,7 @@ mod tests { for entry in fs::read_dir(&store).unwrap() { let entry = entry.unwrap(); let path = entry.path(); - if path.extension().map_or(false, |ext| ext == "page") { + if path.extension().is_some_and(|ext| ext == "page") { let data = fs::read(&path).unwrap(); let hash = blake3::hash(&data); let expected = format!("{}.page", hash.to_hex()); @@ -602,7 +602,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = sample_batch(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let schema = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -646,7 +646,7 @@ mod tests { let first_page = fs::read_dir(&store) .unwrap() .filter_map(|e| e.ok()) - .find(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .find(|e| e.path().extension().is_some_and(|ext| ext == "page")) .unwrap(); fs::remove_file(first_page.path()).unwrap(); diff --git a/parquet/src/arrow/page_store/writer.rs b/parquet/src/arrow/page_store/writer.rs index 87d2b4914ef9..9b4cc123ceb7 100644 --- a/parquet/src/arrow/page_store/writer.rs +++ b/parquet/src/arrow/page_store/writer.rs @@ -196,6 +196,7 @@ fn make_column_writer( Ok(crate::arrow::arrow_writer::ArrowColumnWriter { chunk, writer }) } +#[allow(clippy::too_many_arguments)] fn create_writers_for_type( data_type: &ArrowDataType, props: &WriterPropertiesPtr, @@ -446,9 +447,8 @@ impl PageStoreWriter { let mut total_byte_size = 0i64; let mut cumulative_offset: i64 = self.next_page_offset; - let mut col_idx = 0usize; - for chunk in chunks { + for (col_idx, 
chunk) in chunks.into_iter().enumerate() { let mut close = chunk.close; total_byte_size += close.metadata.uncompressed_size(); @@ -479,7 +479,6 @@ impl PageStoreWriter { } } - col_idx += 1; cumulative_offset += close.metadata.compressed_size(); column_metadata.push(close.metadata); diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index bd03af2b471d..274a4a231018 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -2308,7 +2308,7 @@ mod arrow_tests { None, true, ); - let read = concat_batches(&read_batches(&buf)); + let read = concat_batches(read_batches(&buf)); let read_list = read.column(0).as_list::(); assert_eq!(read_list.len(), 5); assert!(read_list.is_valid(0)); From de45d6dfadfe3fd91863bd6b0d0178e23b8ae549 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 20:11:53 +0200 Subject: [PATCH 7/8] fix(parquet): fix clippy map_or and rustdoc unresolved link in page_store --- parquet/src/arrow/page_store/reader.rs | 2 +- parquet/src/bin/parquet-page-store.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/page_store/reader.rs b/parquet/src/arrow/page_store/reader.rs index f5e02a70e76a..c1637f8f7df4 100644 --- a/parquet/src/arrow/page_store/reader.rs +++ b/parquet/src/arrow/page_store/reader.rs @@ -110,7 +110,7 @@ impl PageStoreReader { )?)) } - /// Build a streaming [`ParquetRecordBatchReader`] over the page store. + /// Build a streaming `ParquetRecordBatchReader` over the page store. /// /// Prefer this over [`Self::read_batches`] for large files — batches are /// decoded on-demand and only one batch is held in memory at a time. diff --git a/parquet/src/bin/parquet-page-store.rs b/parquet/src/bin/parquet-page-store.rs index 6b8d0feb5762..496ec944de1d 100644 --- a/parquet/src/bin/parquet-page-store.rs +++ b/parquet/src/bin/parquet-page-store.rs @@ -314,7 +314,7 @@ fn cmd_write( let page_files = std::fs::read_dir(store)? 
.filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "page")) .count(); eprintln!( "Page store: {} page file(s) in {}", From 96a4f2eb19057737b5151d2fd1115befb0897b33 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 20:27:57 +0200 Subject: [PATCH 8/8] fix(parquet): fix clippy map_or in page_store example --- parquet/examples/page_store.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/examples/page_store.rs b/parquet/examples/page_store.rs index 736ce9354694..9b668546a6bc 100644 --- a/parquet/examples/page_store.rs +++ b/parquet/examples/page_store.rs @@ -77,7 +77,7 @@ fn main() -> parquet::errors::Result<()> { let page_files: Vec<_> = std::fs::read_dir(&store_dir) .unwrap() .filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "page")) .collect(); println!("Page files in store: {}", page_files.len());