From 455bc8b01e1523cf47bad816ebb9e20c2a744e52 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 1 Apr 2026 14:42:59 +0200 Subject: [PATCH 1/8] fix(parquet): fix CDC panic on nested ListArrays with null entries (#9637) The CDC chunker's value_offset diverged from actual leaf array positions when null list entries had non-empty child offset ranges (valid per the Arrow columnar format spec). This caused slice_for_chunk to produce incorrect non_null_indices, leading to an out-of-bounds panic in write_mini_batch. Track non-null value counts (nni) separately from leaf slot counts in the chunker, and use them in slice_for_chunk to correctly index into non_null_indices regardless of gaps in the leaf array. --- parquet/src/arrow/arrow_writer/levels.rs | 196 +++++++++----------- parquet/src/column/chunker/cdc.rs | 219 ++++++++++++++++++++--- parquet/src/column/chunker/mod.rs | 6 +- 3 files changed, 287 insertions(+), 134 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs index d1da24872c49..2ebe1319160f 100644 --- a/parquet/src/arrow/arrow_writer/levels.rs +++ b/parquet/src/arrow/arrow_writer/levels.rs @@ -805,37 +805,26 @@ impl ArrayLevels { /// Create a sliced view of this `ArrayLevels` for a CDC chunk. /// - /// Note: `def_levels`, `rep_levels`, and `non_null_indices` are copied (not zero-copy), - /// while `array` is sliced without copying. + /// The chunk's `value_offset`/`num_values` select the relevant slice of + /// `non_null_indices`. The array is sliced to the range covered by + /// those indices, and they are shifted to be relative to the slice. 
pub(crate) fn slice_for_chunk(&self, chunk: &CdcChunk) -> Self { - let level_offset = chunk.level_offset; - let num_levels = chunk.num_levels; - let value_offset = chunk.value_offset; - let num_values = chunk.num_values; - let def_levels = self - .def_levels - .as_ref() - .map(|levels| levels[level_offset..level_offset + num_levels].to_vec()); - let rep_levels = self - .rep_levels - .as_ref() - .map(|levels| levels[level_offset..level_offset + num_levels].to_vec()); - - // Filter non_null_indices to [value_offset, value_offset + num_values) - // and shift by -value_offset. Use binary search since the slice is sorted. - let value_end = value_offset + num_values; - let start = self - .non_null_indices - .partition_point(|&idx| idx < value_offset); - let end = self - .non_null_indices - .partition_point(|&idx| idx < value_end); - let non_null_indices: Vec = self.non_null_indices[start..end] - .iter() - .map(|&idx| idx - value_offset) - .collect(); + let def_levels = self.def_levels.as_ref().map(|levels| { + levels[chunk.level_offset..chunk.level_offset + chunk.num_levels].to_vec() + }); + let rep_levels = self.rep_levels.as_ref().map(|levels| { + levels[chunk.level_offset..chunk.level_offset + chunk.num_levels].to_vec() + }); - let array = self.array.slice(value_offset, num_values); + // Select the non-null indices for this chunk. + let nni = &self.non_null_indices[chunk.value_offset..chunk.value_offset + chunk.num_values]; + // Compute the array range spanned by the non-null indices + let start = nni.first().copied().unwrap_or(0); + let end = nni.last().map_or(0, |&i| i + 1); + // Shift indices to be relative to the sliced array. + let non_null_indices = nni.iter().map(|&idx| idx - start).collect(); + // Slice the array to the computed range. 
+ let array = self.array.slice(start, end - start); let logical_nulls = array.logical_nulls(); Self { @@ -2149,9 +2138,8 @@ mod tests { fn test_slice_for_chunk_flat() { // Case 1: required field (max_def_level=0, no def/rep levels stored). // Array has 6 values; all are non-null so non_null_indices covers every position. - // The chunk selects value_offset=2, num_values=3 → the sub-array [3, 4, 5]. - // Since there are no levels, num_levels=0 and level_offset are irrelevant. - // non_null_indices [0,1,2,3,4,5] filtered to [2,4) and shifted by -2 → [0,1,2]. + // value_offset=2, num_values=3 → non_null_indices[2..5] = [2,3,4]. + // Array is sliced (no def_levels → write_batch_internal uses values.len()). let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6])); let logical_nulls = array.logical_nulls(); let levels = ArrayLevels { @@ -2176,14 +2164,9 @@ mod tests { // Case 2: optional field (max_def_level=1, def levels present, no rep levels). // Array: [Some(1), None, Some(3), None, Some(5), Some(6)] - // def_levels: [1, 0, 1, 0, 1, 1] (1=non-null, 0=null) - // non_null_indices: [0, 2, 4, 5] (array positions of the four non-null values) - // - // The chunk selects level_offset=1, num_levels=3, value_offset=1, num_values=3: - // - def_levels[1..4] = [0, 1, 0] → null, non-null, null - // - sub-array slice(1, 3) = [None, Some(3), None] - // - non_null_indices filtered to [value_offset=1, value_end=4): only index 2 qualifies, - // shifted by -1 → [1] (position of Some(3) within the sliced sub-array) + // non_null_indices: [0, 2, 4, 5] + // value_offset=1, num_values=1 → non_null_indices[1..2] = [2]. + // Array is not sliced (def_levels present → num_levels from def_levels.len()). 
let array: ArrayRef = Arc::new(Int32Array::from(vec![ Some(1), None, @@ -2206,90 +2189,85 @@ mod tests { level_offset: 1, num_levels: 3, value_offset: 1, - num_values: 3, + num_values: 1, }); assert_eq!(sliced.def_levels, Some(vec![0, 1, 0])); assert!(sliced.rep_levels.is_none()); - assert_eq!(sliced.non_null_indices, vec![1]); - assert_eq!(sliced.array.len(), 3); + assert_eq!(sliced.non_null_indices, vec![0]); // [2] shifted by -2 (nni[0]) + assert_eq!(sliced.array.len(), 1); } #[test] - fn test_slice_for_chunk_nested() { - // [[1,2],[3],[4,5]]: def=[2,2,2,2,2], rep=[0,1,0,0,1] - // Slice levels 2..5 (def=[2,2,2], rep=[0,0,1]), values 2..5 - let array: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])); + fn test_slice_for_chunk_nested_with_nulls() { + // Regression test for https://github.com/apache/arrow-rs/issues/9637 + // + // Simulates a List where null list entries have non-zero child + // ranges (valid per Arrow spec: "a null value may correspond to a + // non-empty segment in the child array"). This creates gaps in the + // leaf array that don't correspond to any levels. 
+ // + // 5 rows with 2 null list entries owning non-empty child ranges: + // row 0: [1] → leaf[0] + // row 1: null list → owns leaf[1..3] (gap of 2) + // row 2: [2, null] → leaf[3], leaf[4]=null element + // row 3: null list → owns leaf[5..8] (gap of 3) + // row 4: [4, 5] → leaf[8], leaf[9] + // + // def_levels: [3, 0, 3, 2, 0, 3, 3] + // rep_levels: [0, 0, 0, 1, 0, 0, 1] + // non_null_indices: [0, 3, 8, 9] + // gaps in array: 0→3 (skip 1,2), 3→8 (skip 5,6,7) + let array: ArrayRef = Arc::new(Int32Array::from(vec![ + Some(1), // 0: row 0 + None, // 1: gap (null list row 1) + None, // 2: gap (null list row 1) + Some(2), // 3: row 2 + None, // 4: row 2, null element + None, // 5: gap (null list row 3) + None, // 6: gap (null list row 3) + None, // 7: gap (null list row 3) + Some(4), // 8: row 4 + Some(5), // 9: row 4 + ])); let logical_nulls = array.logical_nulls(); let levels = ArrayLevels { - def_levels: Some(vec![2, 2, 2, 2, 2]), - rep_levels: Some(vec![0, 1, 0, 0, 1]), - non_null_indices: vec![0, 1, 2, 3, 4], - max_def_level: 2, + def_levels: Some(vec![3, 0, 3, 2, 0, 3, 3]), + rep_levels: Some(vec![0, 0, 0, 1, 0, 0, 1]), + non_null_indices: vec![0, 3, 8, 9], + max_def_level: 3, max_rep_level: 1, array, logical_nulls, }; - let sliced = levels.slice_for_chunk(&CdcChunk { + + // Chunk 0: rows 0-1, nni=[0] → array sliced to [0..1] + let chunk0 = levels.slice_for_chunk(&CdcChunk { + level_offset: 0, + num_levels: 2, + value_offset: 0, + num_values: 1, + }); + assert_eq!(chunk0.non_null_indices, vec![0]); + assert_eq!(chunk0.array.len(), 1); + + // Chunk 1: rows 2-3, nni=[3] → array sliced to [3..4] + let chunk1 = levels.slice_for_chunk(&CdcChunk { level_offset: 2, num_levels: 3, - value_offset: 2, - num_values: 3, + value_offset: 1, + num_values: 1, }); - assert_eq!(sliced.def_levels, Some(vec![2, 2, 2])); - assert_eq!(sliced.rep_levels, Some(vec![0, 0, 1])); - // [0,1,2,3,4] filtered to [2,5) → [2,3,4] → shifted -2 → [0,1,2] - assert_eq!(sliced.non_null_indices, 
vec![0, 1, 2]); - assert_eq!(sliced.array.len(), 3); - } + assert_eq!(chunk1.non_null_indices, vec![0]); + assert_eq!(chunk1.array.len(), 1); - #[test] - fn test_slice_for_chunk_non_null_indices_boundary() { - // [1, null, 3]: non_null_indices=[0, 2]; test inclusive lower / exclusive upper bounds - let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])); - let logical_nulls = array.logical_nulls(); - let levels = ArrayLevels { - def_levels: Some(vec![1, 0, 1]), - rep_levels: None, - non_null_indices: vec![0, 2], - max_def_level: 1, - max_rep_level: 0, - array, - logical_nulls, - }; - assert_eq!( - levels - .slice_for_chunk(&CdcChunk { - level_offset: 0, - num_levels: 1, - value_offset: 0, - num_values: 1 - }) - .non_null_indices, - vec![0] - ); - // idx 2 in range [1,3), shifted -1 → 1 - assert_eq!( - levels - .slice_for_chunk(&CdcChunk { - level_offset: 1, - num_levels: 2, - value_offset: 1, - num_values: 2 - }) - .non_null_indices, - vec![1] - ); - // idx 2 excluded from [1,2) - assert_eq!( - levels - .slice_for_chunk(&CdcChunk { - level_offset: 1, - num_levels: 1, - value_offset: 1, - num_values: 1 - }) - .non_null_indices, - Vec::::new() - ); + // Chunk 2: row 4, nni=[8, 9] → array sliced to [8..10] + let chunk2 = levels.slice_for_chunk(&CdcChunk { + level_offset: 5, + num_levels: 2, + value_offset: 2, + num_values: 2, + }); + assert_eq!(chunk2.non_null_indices, vec![0, 1]); + assert_eq!(chunk2.array.len(), 2); } } diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index f21f58780a6a..750735730874 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -289,27 +289,39 @@ impl ContentDefinedChunker { let mut chunks = Vec::new(); let mut prev_offset: usize = 0; let mut prev_value_offset: usize = 0; - // Total number of values seen; for non-nested data this equals num_levels. 
- let mut total_values: usize = num_levels; + let mut value_offset: usize = 0; if !has_rep_levels && !has_def_levels { // Fastest path: non-nested, non-null data. + // Every level corresponds to exactly one non-null value, so + // value_offset == level_offset and num_values == num_levels. + // + // Example: required Int32, array = [10, 20, 30] + // level: 0 1 2 + // value_offset: 0 1 2 for offset in 0..num_levels { roll_value(self, offset); if self.need_new_chunk() { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_offset, num_levels: offset - prev_offset, + value_offset: prev_offset, num_values: offset - prev_offset, }); prev_offset = offset; } } - // Set the previous value offset to add the last chunk. prev_value_offset = prev_offset; + value_offset = num_levels; } else if !has_rep_levels { - // Non-nested data with nulls. + // Non-nested data with nulls. value_offset only increments for + // non-null values (def == max_def), so it diverges from the + // level offset when nulls are present. + // + // Example: optional Int32, array = [1, null, 2, null, 3] + // def_levels: [1, 0, 1, 0, 1] + // level: 0 1 2 3 4 + // value_offset: 0 1 2 (only increments on def==1) let def_levels = def_levels.expect("def_levels required when max_def_level > 0"); #[allow(clippy::needless_range_loop)] for offset in 0..num_levels { @@ -318,23 +330,56 @@ impl ContentDefinedChunker { if def_level == self.max_def_level { roll_value(self, offset); } + // Check boundary before incrementing value_offset so that + // num_values reflects only entries in the completed chunk. 
if self.need_new_chunk() { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_offset, num_levels: offset - prev_offset, - num_values: offset - prev_offset, + value_offset: prev_value_offset, + num_values: value_offset - prev_value_offset, }); prev_offset = offset; + prev_value_offset = value_offset; + } + if def_level == self.max_def_level { + value_offset += 1; } } - // Set the previous value offset to add the last chunk. - prev_value_offset = prev_offset; } else { - // Nested data with nulls. + // Nested data with nulls. Two counters are needed: + // + // leaf_offset: index into the leaf values array for hashing, + // incremented for all leaf slots (def >= repeated_ancestor_def_level), + // including null elements. + // + // value_offset: index into non_null_indices for chunk boundaries, + // incremented only for non-null leaf values (def == max_def_level). + // + // These diverge when nullable elements exist inside lists. + // + // Example: List with repeated_ancestor_def_level=2, max_def=3 + // row 0: [1, null, 2] (3 leaf slots, 2 non-null) + // row 1: [3] (1 leaf slot, 1 non-null) + // + // leaf array: [1, null, 2, 3] + // def_levels: [3, 2, 3, 3] + // rep_levels: [0, 1, 1, 0] + // + // level def leaf_offset value_offset action + // ───── ─── ─────────── ──────────── ────────────────────────── + // 0 3 0 0 roll_value(0), value++, leaf++ + // 1 2 1 1 leaf++ only (null element) + // 2 3 2 1 roll_value(2), value++, leaf++ + // 3 3 3 2 roll_value(3), value++, leaf++ + // + // roll_value(2) correctly indexes leaf array position 2 (value "2"). + // Using value_offset=1 would index position 1 (the null slot). + // + // Using value_offset for roll_value would hash the wrong array slot. 
let def_levels = def_levels.expect("def_levels required for nested data"); let rep_levels = rep_levels.expect("rep_levels required for nested data"); - let mut value_offset: usize = 0; + let mut leaf_offset: usize = 0; for offset in 0..num_levels { let def_level = def_levels[offset]; @@ -343,43 +388,45 @@ impl ContentDefinedChunker { self.roll_level(def_level); self.roll_level(rep_level); if def_level == self.max_def_level { - roll_value(self, value_offset); + roll_value(self, leaf_offset); } + // Check boundary before incrementing value_offset so that + // num_values reflects only entries in the completed chunk. if rep_level == 0 && self.need_new_chunk() { - // If we are at a record boundary and need a new chunk, create one. let levels_to_write = offset - prev_offset; if levels_to_write > 0 { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_value_offset, num_levels: levels_to_write, + value_offset: prev_value_offset, num_values: value_offset - prev_value_offset, }); prev_offset = offset; prev_value_offset = value_offset; } } - if def_level >= self.repeated_ancestor_def_level { - // We only increment the value offset if we have a leaf value. + if def_level == self.max_def_level { value_offset += 1; } + if def_level >= self.repeated_ancestor_def_level { + leaf_offset += 1; + } } - total_values = value_offset; } // Add the last chunk if we have any levels left. 
if prev_offset < num_levels { chunks.push(CdcChunk { level_offset: prev_offset, - value_offset: prev_value_offset, num_levels: num_levels - prev_offset, - num_values: total_values - prev_value_offset, + value_offset: prev_value_offset, + num_values: value_offset - prev_value_offset, }); } #[cfg(debug_assertions)] - self.validate_chunks(&chunks, num_levels, total_values); + self.validate_chunks(&chunks, num_levels, value_offset); chunks } @@ -626,8 +673,9 @@ mod tests { assert_eq!(chunks1.len(), chunks2.len()); for (a, b) in chunks1.iter().zip(chunks2.iter()) { assert_eq!(a.level_offset, b.level_offset); - assert_eq!(a.value_offset, b.value_offset); assert_eq!(a.num_levels, b.num_levels); + assert_eq!(a.value_offset, b.value_offset); + assert_eq!(a.num_values, b.num_values); } } @@ -663,9 +711,12 @@ mod arrow_tests { use std::borrow::Borrow; use std::sync::Arc; + use arrow::util::data_gen::create_random_batch; use arrow_array::cast::AsArray; use arrow_array::{Array, ArrayRef, BooleanArray, Int32Array, RecordBatch}; - use arrow_schema::{DataType, Field, Schema}; + use arrow_buffer::Buffer; + use arrow_data::ArrayData; + use arrow_schema::{DataType, Field, Fields, Schema}; use crate::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use crate::arrow::arrow_writer::ArrowWriter; @@ -2153,4 +2204,128 @@ mod arrow_tests { "all chunks after the first must be identical" ); } + + /// Helper to write a batch with CDC and read it back. 
+ fn cdc_roundtrip(batch: &RecordBatch) -> RecordBatch { + let props = WriterProperties::builder() + .set_content_defined_chunking(Some(CdcOptions::default())) + .build(); + let mut buffer = Vec::new(); + let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); + writer.write(batch).unwrap(); + writer.close().unwrap(); + + let reader = ParquetRecordBatchReaderBuilder::try_new(bytes::Bytes::from(buffer)) + .unwrap() + .build() + .unwrap(); + reader.into_iter().next().unwrap().unwrap() + } + + /// Regression test for + /// + /// Writing nested list data with CDC enabled panicked with an out-of-bounds + /// slice access when null list entries had non-zero child ranges. + #[test] + fn test_cdc_list_roundtrip() { + let schema = Arc::new(Schema::new(vec![ + Field::new( + "_1", + DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))), + true, + ), + Field::new( + "_2", + DataType::List(Arc::new(Field::new_list_field(DataType::Boolean, true))), + true, + ), + Field::new( + "_3", + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Utf8, true))), + true, + ), + ])); + let batch = create_random_batch(schema, 2, 0.25, 0.75).unwrap(); + assert_eq!(cdc_roundtrip(&batch), batch); + } + + /// Test CDC with deeply nested types: List>, List>> + #[test] + fn test_cdc_deeply_nested_roundtrip() { + let inner_field = Field::new_list_field(DataType::Int32, true); + let inner_type = DataType::List(Arc::new(inner_field)); + let outer_field = Field::new_list_field(inner_type.clone(), true); + let list_list_type = DataType::List(Arc::new(outer_field)); + + let struct_inner_field = Field::new_list_field(DataType::Int32, true); + let struct_inner_type = DataType::List(Arc::new(struct_inner_field)); + let struct_fields = Fields::from(vec![Field::new("a", struct_inner_type, true)]); + let struct_type = DataType::Struct(struct_fields); + let struct_list_field = Field::new_list_field(struct_type, true); + let list_struct_type = 
DataType::List(Arc::new(struct_list_field)); + + let schema = Arc::new(Schema::new(vec![ + Field::new("list_list", list_list_type, true), + Field::new("list_struct_list", list_struct_type, true), + ])); + let batch = create_random_batch(schema, 200, 0.25, 0.75).unwrap(); + assert_eq!(cdc_roundtrip(&batch), batch); + } + + /// Test CDC with list arrays that have non-empty null segments. + /// + /// Per the Arrow columnar format spec: "a null value may correspond to a + /// non-empty segment in the child array". This test constructs such arrays + /// manually and verifies the CDC writer handles them correctly. + #[test] + fn test_cdc_list_non_empty_null_segments() { + // Build List where null entries own non-zero child ranges: + // row 0: [1, 2] offsets[0..2] valid + // row 1: null offsets[2..5] null, but owns 3 child values + // row 2: [6, 7] offsets[5..7] valid + // row 3: null offsets[7..9] null, but owns 2 child values + // row 4: [10] offsets[9..10] valid + let values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + let offsets = Buffer::from_iter([0_i32, 2, 5, 7, 9, 10]); + let null_bitmap = Buffer::from([0b00010101]); // rows 0, 2, 4 valid + + let list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false))); + let list_data = unsafe { + ArrayData::new_unchecked( + list_type.clone(), + 5, + None, + Some(null_bitmap), + 0, + vec![offsets], + vec![values.to_data()], + ) + }; + let list_array = arrow_array::make_array(list_data); + + let schema = Arc::new(Schema::new(vec![Field::new("col", list_type, true)])); + let batch = RecordBatch::try_new(schema, vec![list_array]).unwrap(); + + let read = cdc_roundtrip(&batch); + let read_list = read.column(0).as_list::(); + assert_eq!(read_list.len(), 5); + assert!(read_list.is_valid(0)); + assert!(read_list.is_null(1)); + assert!(read_list.is_valid(2)); + assert!(read_list.is_null(3)); + assert!(read_list.is_valid(4)); + + let get_vals = |i: usize| -> Vec { + read_list + .value(i) + 
.as_primitive::() + .values() + .iter() + .copied() + .collect() + }; + assert_eq!(get_vals(0), vec![1, 2]); + assert_eq!(get_vals(2), vec![6, 7]); + assert_eq!(get_vals(4), vec![10]); + } } diff --git a/parquet/src/column/chunker/mod.rs b/parquet/src/column/chunker/mod.rs index c4caf18af66b..42631e026db4 100644 --- a/parquet/src/column/chunker/mod.rs +++ b/parquet/src/column/chunker/mod.rs @@ -31,10 +31,10 @@ pub(crate) use cdc::ContentDefinedChunker; pub(crate) struct CdcChunk { /// The start offset of this chunk inside the given levels. pub level_offset: usize, - /// The start offset of this chunk inside the given values array. - pub value_offset: usize, /// The number of levels in this chunk. pub num_levels: usize, - /// The number of values (Arrow array elements) in this chunk. + /// The start index into `non_null_indices` for this chunk. + pub value_offset: usize, + /// The number of `non_null_indices` entries in this chunk. pub num_values: usize, } From 4fb27f65647939e2c97f91564d5071758498eced Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Thu, 2 Apr 2026 16:51:13 +0200 Subject: [PATCH 2/8] refactor(parquet): reuse existing write_with_cdc_options in regression tests --- parquet/src/column/chunker/cdc.rs | 46 ++++++++++++++++--------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index 750735730874..bd03af2b471d 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -2205,23 +2205,6 @@ mod arrow_tests { ); } - /// Helper to write a batch with CDC and read it back. 
- fn cdc_roundtrip(batch: &RecordBatch) -> RecordBatch { - let props = WriterProperties::builder() - .set_content_defined_chunking(Some(CdcOptions::default())) - .build(); - let mut buffer = Vec::new(); - let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props)).unwrap(); - writer.write(batch).unwrap(); - writer.close().unwrap(); - - let reader = ParquetRecordBatchReaderBuilder::try_new(bytes::Bytes::from(buffer)) - .unwrap() - .build() - .unwrap(); - reader.into_iter().next().unwrap().unwrap() - } - /// Regression test for /// /// Writing nested list data with CDC enabled panicked with an out-of-bounds @@ -2245,8 +2228,14 @@ mod arrow_tests { true, ), ])); - let batch = create_random_batch(schema, 2, 0.25, 0.75).unwrap(); - assert_eq!(cdc_roundtrip(&batch), batch); + let batch = create_random_batch(schema, 10_000, 0.25, 0.75).unwrap(); + write_with_cdc_options( + &[&batch], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + None, + true, + ); } /// Test CDC with deeply nested types: List>, List>> @@ -2268,8 +2257,14 @@ mod arrow_tests { Field::new("list_list", list_list_type, true), Field::new("list_struct_list", list_struct_type, true), ])); - let batch = create_random_batch(schema, 200, 0.25, 0.75).unwrap(); - assert_eq!(cdc_roundtrip(&batch), batch); + let batch = create_random_batch(schema, 10_000, 0.25, 0.75).unwrap(); + write_with_cdc_options( + &[&batch], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + None, + true, + ); } /// Test CDC with list arrays that have non-empty null segments. 
@@ -2306,7 +2301,14 @@ mod arrow_tests { let schema = Arc::new(Schema::new(vec![Field::new("col", list_type, true)])); let batch = RecordBatch::try_new(schema, vec![list_array]).unwrap(); - let read = cdc_roundtrip(&batch); + let buf = write_with_cdc_options( + &[&batch], + CDC_MIN_CHUNK_SIZE, + CDC_MAX_CHUNK_SIZE, + None, + true, + ); + let read = concat_batches(&read_batches(&buf)); let read_list = read.column(0).as_list::(); assert_eq!(read_list.len(), 5); assert!(read_list.is_valid(0)); From 3533fd8cc309385c849706f283ee5fbe568d8489 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Wed, 1 Apr 2026 17:35:30 +0200 Subject: [PATCH 3/8] feat(parquet): add content-addressed page store with CDC deduplication --- parquet/Cargo.toml | 13 + parquet/examples/page_store.rs | 102 +++++ parquet/src/arrow/arrow_writer/mod.rs | 34 +- parquet/src/arrow/mod.rs | 2 + parquet/src/arrow/page_store/mod.rs | 539 +++++++++++++++++++++++++ parquet/src/arrow/page_store/reader.rs | 248 ++++++++++++ parquet/src/arrow/page_store/writer.rs | 511 +++++++++++++++++++++++ parquet/src/bin/parquet-page-store.rs | 278 +++++++++++++ 8 files changed, 1710 insertions(+), 17 deletions(-) create mode 100644 parquet/examples/page_store.rs create mode 100644 parquet/src/arrow/page_store/mod.rs create mode 100644 parquet/src/arrow/page_store/reader.rs create mode 100644 parquet/src/arrow/page_store/writer.rs create mode 100644 parquet/src/bin/parquet-page-store.rs diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index efcd1fe2190b..5e4eeacee1f8 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -43,6 +43,7 @@ arrow-csv = { workspace = true, optional = true } arrow-data = { workspace = true, optional = true } arrow-schema = { workspace = true, optional = true } arrow-select = { workspace = true, optional = true } +arrow-cast = { workspace = true, optional = true, features = ["prettyprint"] } arrow-ipc = { workspace = true, optional = true } parquet-geospatial = { workspace = true, optional 
= true } parquet-variant = { workspace = true, optional = true } @@ -77,6 +78,7 @@ half = { version = "2.1", default-features = false, features = ["num-traits"] } crc32fast = { version = "1.4.2", optional = true, default-features = false } simdutf8 = { workspace = true , optional = true } ring = { version = "0.17", default-features = false, features = ["std"], optional = true } +blake3 = { version = "1", default-features = false, optional = true } [dev-dependencies] base64 = { version = "0.22", default-features = false, features = ["std"] } @@ -134,6 +136,8 @@ flate2-zlib-rs = ["flate2/zlib-rs"] variant_experimental = ["arrow", "parquet-variant", "parquet-variant-json", "parquet-variant-compute"] # Enable geospatial support geospatial = ["parquet-geospatial"] +# Enable page store (content-addressed page storage) +page_store = ["arrow", "dep:blake3", "dep:arrow-cast", "serde", "serde_json"] [[example]] @@ -151,6 +155,11 @@ name = "write_parquet" required-features = ["cli"] path = "./examples/write_parquet.rs" +[[example]] +name = "page_store" +required-features = ["page_store"] +path = "./examples/page_store.rs" + [[example]] name = "read_with_rowgroup" required-features = ["arrow", "async"] @@ -180,6 +189,10 @@ name = "variant_integration" required-features = ["arrow", "variant_experimental", "serde"] path = "./tests/variant_integration.rs" +[[bin]] +name = "parquet-page-store" +required-features = ["page_store", "cli"] + [[bin]] name = "parquet-read" required-features = ["cli"] diff --git a/parquet/examples/page_store.rs b/parquet/examples/page_store.rs new file mode 100644 index 000000000000..8b963329c997 --- /dev/null +++ b/parquet/examples/page_store.rs @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Example demonstrating the Parquet Page Store. +//! +//! Writes Arrow RecordBatches to a content-addressed page store and reads them back. + +use std::sync::Arc; + +use arrow_array::{ArrayRef, Float64Array, Int32Array, RecordBatch, StringArray}; +use arrow_cast::pretty::pretty_format_batches; +use parquet::arrow::page_store::{PageStoreReader, PageStoreWriter}; +use parquet::file::properties::{CdcOptions, EnabledStatistics, WriterProperties}; +use tempfile::TempDir; + +fn main() -> parquet::errors::Result<()> { + let tempdir = TempDir::new().unwrap(); + let store_dir = tempdir.path().join("page_store"); + + // Create sample data + let batch = RecordBatch::try_from_iter(vec![ + ( + "id", + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])) as ArrayRef, + ), + ( + "value", + Arc::new(Float64Array::from(vec![ + 1.0, 2.5, 3.7, 4.2, 5.9, 6.1, 7.3, 8.8, 9.0, 10.5, + ])) as ArrayRef, + ), + ( + "name", + Arc::new(StringArray::from(vec![ + "alice", "bob", "charlie", "diana", "eve", "frank", "grace", "heidi", "ivan", + "judy", + ])) as ArrayRef, + ), + ]) + .unwrap(); + + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .set_content_defined_chunking(Some(CdcOptions::default())) + .build(); + + let metadata_path = tempdir.path().join("table.parquet"); + + // Write to page store + println!("Page store dir: {}", store_dir.display()); + 
println!("Metadata file: {}", metadata_path.display()); + let mut writer = PageStoreWriter::try_new(&store_dir, batch.schema(), Some(props))?; + writer.write(&batch)?; + let metadata = writer.finish(&metadata_path)?; + + println!( + "Wrote {} row group(s), {} total rows", + metadata.num_row_groups(), + metadata.file_metadata().num_rows() + ); + + // List page files + let page_files: Vec<_> = std::fs::read_dir(&store_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .collect(); + println!("Page files in store: {}", page_files.len()); + + // Read back from page store + println!("\nReading from page store..."); + let reader = PageStoreReader::try_new(&metadata_path, &store_dir)?; + let batches = reader.read_batches()?; + + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + println!("Read {} batch(es), {} total rows", batches.len(), total_rows); + + // Display + let formatted = pretty_format_batches(&batches).unwrap(); + println!("\n{formatted}"); + + // Verify round-trip + assert_eq!(batches.len(), 1); + assert_eq!(batches[0], batch); + println!("\nRound-trip verification: PASSED"); + + Ok(()) +} diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 2ef71d5745a2..5bf226701671 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -605,9 +605,9 @@ impl ArrowWriterOptions { /// A single column chunk produced by [`ArrowColumnWriter`] #[derive(Default)] -struct ArrowColumnChunkData { - length: usize, - data: Vec, +pub(crate) struct ArrowColumnChunkData { + pub(crate) length: usize, + pub(crate) data: Vec, } impl Length for ArrowColumnChunkData { @@ -632,7 +632,7 @@ impl ChunkReader for ArrowColumnChunkData { } /// A [`Read`] for [`ArrowColumnChunkData`] -struct ArrowColumnChunkReader(Peekable>); +pub(crate) struct ArrowColumnChunkReader(Peekable>); impl Read for ArrowColumnChunkReader { fn read(&mut 
self, out: &mut [u8]) -> std::io::Result { @@ -658,7 +658,7 @@ impl Read for ArrowColumnChunkReader { /// /// This allows it to be owned by [`ArrowPageWriter`] whilst allowing access via /// [`ArrowRowGroupWriter`] on flush, without requiring self-referential borrows -type SharedColumnChunk = Arc>; +pub(crate) type SharedColumnChunk = Arc>; #[derive(Default)] struct ArrowPageWriter { @@ -752,8 +752,8 @@ pub fn compute_leaves(field: &Field, array: &ArrayRef) -> Result), Column(ColumnWriter<'static>), } @@ -989,14 +989,14 @@ impl ArrowColumnWriter { /// /// See the example on [`ArrowColumnWriter`] for how to encode columns in parallel #[derive(Debug)] -struct ArrowRowGroupWriter { - writers: Vec, +pub(crate) struct ArrowRowGroupWriter { + pub(crate) writers: Vec, schema: SchemaRef, - buffered_rows: usize, + pub(crate) buffered_rows: usize, } impl ArrowRowGroupWriter { - fn new(writers: Vec, arrow: &SchemaRef) -> Self { + pub(crate) fn new(writers: Vec, arrow: &SchemaRef) -> Self { Self { writers, schema: arrow.clone(), @@ -1004,7 +1004,7 @@ impl ArrowRowGroupWriter { } } - fn write(&mut self, batch: &RecordBatch) -> Result<()> { + pub(crate) fn write(&mut self, batch: &RecordBatch) -> Result<()> { self.buffered_rows += batch.num_rows(); let mut writers = self.writers.iter_mut(); for (field, column) in self.schema.fields().iter().zip(batch.columns()) { @@ -1015,7 +1015,7 @@ impl ArrowRowGroupWriter { Ok(()) } - fn write_with_chunkers( + pub(crate) fn write_with_chunkers( &mut self, batch: &RecordBatch, chunkers: &mut [ContentDefinedChunker], @@ -1042,7 +1042,7 @@ impl ArrowRowGroupWriter { .sum() } - fn close(self) -> Result> { + pub(crate) fn close(self) -> Result> { self.writers .into_iter() .map(|writer| writer.close()) diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index 52152988166f..e24788e4bcd7 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -182,6 +182,8 @@ experimental!(mod array_reader); pub mod arrow_reader; pub mod 
arrow_writer; +#[cfg(feature = "page_store")] +pub mod page_store; mod buffer; mod decoder; diff --git a/parquet/src/arrow/page_store/mod.rs b/parquet/src/arrow/page_store/mod.rs new file mode 100644 index 000000000000..87aa408b7840 --- /dev/null +++ b/parquet/src/arrow/page_store/mod.rs @@ -0,0 +1,539 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Content-addressed page store for Parquet files. +//! +//! This module provides [`PageStoreWriter`] and [`PageStoreReader`] for writing +//! and reading Parquet data through a content-addressed page store. Each data +//! page is stored as a separate file named by its BLAKE3 hash, enabling +//! cross-file page-level deduplication when used with +//! [content-defined chunking](crate::file::properties::CdcOptions). + +mod reader; +mod writer; + +pub use reader::PageStoreReader; +pub use writer::PageStoreWriter; + +use serde::{Deserialize, Serialize}; + +/// A reference to a page stored in the content-addressed page store. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PageRef { + /// Row group index + pub row_group: usize, + /// Column index (leaf column) + pub column: usize, + /// Page index within this column chunk (0-based) + pub page_index: usize, + /// Byte offset within the virtual column chunk + pub offset: i64, + /// Compressed page size in bytes (thrift header + data) + pub size: i32, + /// BLAKE3 hash hex string (64 chars) + pub hash: String, + /// True for dictionary pages + pub is_dict: bool, +} + +/// Manifest stored in the metadata-only parquet file's key-value metadata. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PageStoreManifest { + /// All page references across all row groups and columns + pub pages: Vec, +} + +/// The key used to store the page store manifest in parquet key-value metadata. +const MANIFEST_KEY: &str = "page_store.manifest"; + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::Path; + use std::sync::Arc; + + use arrow_array::{ + ArrayRef, BooleanArray, Float64Array, Int32Array, ListArray, RecordBatch, + StringArray, StructArray, + }; + use arrow_schema::Field; + + use super::*; + use crate::errors::Result; + use crate::file::metadata::{ + FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataWriter, + }; + use crate::file::properties::{EnabledStatistics, WriterProperties}; + use crate::arrow::ArrowSchemaConverter; + use crate::schema::types::SchemaDescriptor; + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + fn count_page_files(dir: &Path) -> usize { + fs::read_dir(dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .count() + } + + fn write_batches( + store_dir: &Path, + metadata_path: &Path, + batches: &[RecordBatch], + props: Option, + ) -> Result { + let schema = batches[0].schema(); + let mut writer = 
PageStoreWriter::try_new(store_dir, schema, props)?; + for batch in batches { + writer.write(batch)?; + } + writer.finish(metadata_path) + } + + fn sample_batch() -> RecordBatch { + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as ArrayRef), + ("value", Arc::new(Float64Array::from(vec![1.0, 2.5, 3.7, 4.2, 5.9])) as ArrayRef), + ("name", Arc::new(StringArray::from(vec!["alice", "bob", "charlie", "diana", "eve"])) as ArrayRef), + ]) + .unwrap() + } + + // ----------------------------------------------------------------------- + // Round-trip tests + // ----------------------------------------------------------------------- + + #[test] + fn test_round_trip() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let metadata = write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + assert_eq!(metadata.num_row_groups(), 1); + assert_eq!(metadata.file_metadata().num_rows(), 5); + assert!(count_page_files(&store) > 0); + + let reader = PageStoreReader::try_new(&meta, &store).unwrap(); + let batches = reader.read_batches().unwrap(); + assert_eq!(batches.len(), 1); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_multiple_batches_single_row_group() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let b1 = RecordBatch::try_from_iter(vec![ + ("x", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), + ]).unwrap(); + let b2 = RecordBatch::try_from_iter(vec![ + ("x", Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef), + ]).unwrap(); + + let metadata = write_batches(&store, &meta, &[b1, b2], None).unwrap(); + assert_eq!(metadata.num_row_groups(), 1); + assert_eq!(metadata.file_metadata().num_rows(), 5); + + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let total: usize = 
batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 5); + } + + #[test] + fn test_multiple_row_groups_via_flush() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let mut writer = PageStoreWriter::try_new(&store, batch.schema(), None).unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + assert_eq!(metadata.num_row_groups(), 3); + assert_eq!(metadata.file_metadata().num_rows(), 15); + + let total: usize = PageStoreReader::try_new(&meta, &store) + .unwrap().read_batches().unwrap() + .iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 15); + } + + #[test] + fn test_flush_empty_is_noop() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let mut writer = PageStoreWriter::try_new(&store, batch.schema(), None).unwrap(); + writer.flush().unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + assert_eq!(metadata.num_row_groups(), 1); + assert_eq!(metadata.file_metadata().num_rows(), 5); + } + + // ----------------------------------------------------------------------- + // Column type tests + // ----------------------------------------------------------------------- + + #[test] + fn test_nullable_columns() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)])) as ArrayRef), + ("label", Arc::new(StringArray::from(vec![Some("a"), Some("b"), None, None, Some("e")])) as ArrayRef), + ]).unwrap(); + + 
write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_boolean_column() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = RecordBatch::try_from_iter(vec![ + ("flag", Arc::new(BooleanArray::from(vec![true, false, true, true, false])) as ArrayRef), + ]).unwrap(); + + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_nested_struct_column() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let struct_array = StructArray::from(vec![ + (Arc::new(Field::new("a", arrow_schema::DataType::Int32, false)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), + (Arc::new(Field::new("b", arrow_schema::DataType::Utf8, false)), + Arc::new(StringArray::from(vec!["x", "y", "z"])) as ArrayRef), + ]); + let batch = RecordBatch::try_from_iter(vec![ + ("s", Arc::new(struct_array) as ArrayRef), + ]).unwrap(); + + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_list_column() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let values = Int32Array::from(vec![1, 2, 3, 4, 5, 6]); + let offsets = arrow_buffer::OffsetBuffer::new(vec![0, 2, 2, 5, 6].into()); + let list = ListArray::new( + Arc::new(Field::new_list_field(arrow_schema::DataType::Int32, false)), + offsets, + Arc::new(values), + None, + ); + let batch = RecordBatch::try_from_iter(vec![ + 
("items", Arc::new(list) as ArrayRef), + ]).unwrap(); + + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + // ----------------------------------------------------------------------- + // CDC / dedup tests + // ----------------------------------------------------------------------- + + #[test] + fn test_cdc_enabled_by_default() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + let total: usize = PageStoreReader::try_new(&meta, &store) + .unwrap().read_batches().unwrap() + .iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 5); + } + + #[test] + fn test_cdc_enabled_even_with_custom_props() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + write_batches(&store, &meta, &[batch.clone()], Some(props)).unwrap(); + + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_dedup_identical_row_groups() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + let props = WriterProperties::builder() + .set_max_row_group_row_count(Some(5)) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let mut writer = PageStoreWriter::try_new(&store, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + writer.write(&batch).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + 
assert_eq!(metadata.num_row_groups(), 2); + + let reader = PageStoreReader::try_new(&meta, &store).unwrap(); + let manifest = reader.manifest(); + + let rg0: Vec<_> = manifest.pages.iter().filter(|p| p.row_group == 0).collect(); + let rg1: Vec<_> = manifest.pages.iter().filter(|p| p.row_group == 1).collect(); + assert_eq!(rg0.len(), rg1.len()); + for (p0, p1) in rg0.iter().zip(rg1.iter()) { + assert_eq!(p0.hash, p1.hash); + } + + let unique: std::collections::HashSet<_> = manifest.pages.iter().map(|p| &p.hash).collect(); + assert_eq!(count_page_files(&store), unique.len()); + + let total: usize = reader.read_batches().unwrap().iter().map(|b| b.num_rows()).sum(); + assert_eq!(total, 10); + } + + #[test] + fn test_cross_file_dedup() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta_a = tmp.path().join("table_a.parquet"); + let meta_b = tmp.path().join("table_b.parquet"); + + let batch = sample_batch(); + + write_batches(&store, &meta_a, &[batch.clone()], None).unwrap(); + let pages_after_first = count_page_files(&store); + + write_batches(&store, &meta_b, &[batch.clone()], None).unwrap(); + let pages_after_second = count_page_files(&store); + + assert_eq!(pages_after_first, pages_after_second); + + let batches_a = PageStoreReader::try_new(&meta_a, &store).unwrap().read_batches().unwrap(); + let batches_b = PageStoreReader::try_new(&meta_b, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches_a, batches_b); + assert_eq!(batches_a[0], batch); + } + + // ----------------------------------------------------------------------- + // Page integrity tests + // ----------------------------------------------------------------------- + + #[test] + fn test_metadata_path_outside_store() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("shared_pages"); + let meta = tmp.path().join("elsewhere").join("my_table.parquet"); + fs::create_dir_all(meta.parent().unwrap()).unwrap(); + + let batch = 
sample_batch(); + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + assert_eq!(batches[0], batch); + } + + #[test] + fn test_page_integrity() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + + for entry in fs::read_dir(&store).unwrap() { + let entry = entry.unwrap(); + let path = entry.path(); + if path.extension().map_or(false, |ext| ext == "page") { + let data = fs::read(&path).unwrap(); + let hash = blake3::hash(&data); + let expected = format!("{}.page", hash.to_hex()); + assert_eq!(path.file_name().unwrap().to_str().unwrap(), expected); + } + } + } + + #[test] + fn test_manifest_page_refs_consistent() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let metadata = write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + let reader = PageStoreReader::try_new(&meta, &store).unwrap(); + let manifest = reader.manifest(); + + for pr in &manifest.pages { + assert!(store.join(format!("{}.page", pr.hash)).exists()); + } + assert!(manifest.pages.iter().all(|p| p.row_group == 0)); + + let columns: std::collections::HashSet<_> = manifest.pages.iter().map(|p| p.column).collect(); + assert_eq!(columns.len(), metadata.row_groups()[0].num_columns()); + + for col in &columns { + let mut idxs: Vec<_> = manifest.pages.iter() + .filter(|p| p.column == *col) + .map(|p| p.page_index) + .collect(); + idxs.sort(); + assert_eq!(idxs, (0..idxs.len()).collect::>()); + } + } + + // ----------------------------------------------------------------------- + // Reader accessors + // ----------------------------------------------------------------------- + + #[test] + fn test_reader_schema() { + let tmp = tempfile::tempdir().unwrap(); + let store 
= tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let batch = sample_batch(); + write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + + let schema = PageStoreReader::try_new(&meta, &store).unwrap().schema().unwrap(); + assert_eq!(schema.fields(), batch.schema().fields()); + } + + #[test] + fn test_reader_metadata() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + + let md = PageStoreReader::try_new(&meta, &store).unwrap(); + assert_eq!(md.metadata().num_row_groups(), 1); + assert_eq!(md.metadata().file_metadata().num_rows(), 5); + assert_eq!(md.metadata().row_groups()[0].num_columns(), 3); + } + + // ----------------------------------------------------------------------- + // Reader error cases + // ----------------------------------------------------------------------- + + #[test] + fn test_reader_missing_metadata_file() { + let tmp = tempfile::tempdir().unwrap(); + assert!(PageStoreReader::try_new(tmp.path().join("no.parquet"), tmp.path()).is_err()); + } + + #[test] + fn test_reader_missing_page_file() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + write_batches(&store, &meta, &[sample_batch()], None).unwrap(); + + let first_page = fs::read_dir(&store).unwrap() + .filter_map(|e| e.ok()) + .find(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .unwrap(); + fs::remove_file(first_page.path()).unwrap(); + + assert!(PageStoreReader::try_new(&meta, &store).unwrap().read_batches().is_err()); + } + + #[test] + fn test_reader_corrupt_manifest() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema_descr = 
Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); + let file_metadata = FileMetaData::new( + 2, 0, None, + Some(vec![KeyValue::new(MANIFEST_KEY.to_string(), "not json{{{".to_string())]), + schema_descr, None, + ); + fs::create_dir_all(&store).unwrap(); + let file = fs::File::create(&meta).unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + + let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); + assert!(err.contains("expected"), "unexpected error: {err}"); + } + + #[test] + fn test_reader_missing_manifest_key() { + let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); + let file_metadata = FileMetaData::new(2, 0, None, None, schema_descr, None); + fs::create_dir_all(&store).unwrap(); + let file = fs::File::create(&meta).unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + + let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); + assert!(err.contains(MANIFEST_KEY), "error should mention key: {err}"); + } +} diff --git a/parquet/src/arrow/page_store/reader.rs b/parquet/src/arrow/page_store/reader.rs new file mode 100644 index 000000000000..9712bf0e2c03 --- /dev/null +++ b/parquet/src/arrow/page_store/reader.rs @@ -0,0 +1,248 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`PageStoreReader`] — reads Arrow data from a content-addressed page store. + +use std::collections::BTreeMap; +use std::fs; +use std::io::{self, Cursor}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use bytes::Bytes; + +use arrow_array::RecordBatch; +use arrow_schema::{ArrowError, SchemaRef}; + +use super::{PageStoreManifest, MANIFEST_KEY}; +use crate::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, +}; +use crate::errors::Result; +use crate::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; +use crate::file::reader::{ChunkReader, Length}; + +/// Reads Parquet data from a content-addressed page store. +/// +/// Takes a metadata-only Parquet file (written by [`super::PageStoreWriter`]) +/// and the `store_dir` that holds the `{hash}.page` blobs. The metadata file +/// can live anywhere — it does not need to be inside `store_dir`. +/// +/// Pages are read on-demand from the store directory — only the pages +/// needed for the requested row groups are loaded into memory. 
+/// +/// # Example +/// ```no_run +/// # use parquet::arrow::page_store::PageStoreReader; +/// let reader = PageStoreReader::try_new( +/// "/data/tables/my_table.parquet", +/// "/data/pages", +/// ).unwrap(); +/// let batches = reader.read_batches().unwrap(); +/// ``` +pub struct PageStoreReader { + store_dir: PathBuf, + metadata: Arc, + manifest: PageStoreManifest, +} + +impl std::fmt::Debug for PageStoreReader { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PageStoreReader") + .field("store_dir", &self.store_dir) + .field("num_pages", &self.manifest.pages.len()) + .finish() + } +} + +impl PageStoreReader { + /// Open a page-store-backed Parquet file. + /// + /// * `metadata_path` — path to the metadata-only `.parquet` file. + /// * `store_dir` — directory containing `{hash}.page` blobs. + pub fn try_new( + metadata_path: impl AsRef, + store_dir: impl Into, + ) -> Result { + let store_dir = store_dir.into(); + let file = fs::File::open(metadata_path.as_ref())?; + + let metadata = ParquetMetaDataReader::new() + .with_page_index_policy(PageIndexPolicy::Required) + .parse_and_finish(&file)?; + + let manifest = Self::parse_manifest(&metadata)?; + + Ok(Self { + store_dir, + metadata: Arc::new(metadata), + manifest, + }) + } + + /// Returns a reference to the Parquet metadata. + pub fn metadata(&self) -> &ParquetMetaData { + &self.metadata + } + + /// Returns the manifest with all page references. + pub fn manifest(&self) -> &PageStoreManifest { + &self.manifest + } + + /// Returns the Arrow schema. + pub fn schema(&self) -> std::result::Result { + let parquet_schema = self.metadata.file_metadata().schema_descr(); + Ok(Arc::new(crate::arrow::parquet_to_arrow_schema( + parquet_schema, + self.metadata.file_metadata().key_value_metadata(), + )?)) + } + + /// Build a streaming [`ParquetRecordBatchReader`] over the page store. 
+ /// + /// Prefer this over [`Self::read_batches`] for large files — batches are + /// decoded on-demand and only one batch is held in memory at a time. + pub fn reader(&self) -> Result { + let chunk_reader = PageStoreChunkReader::new(self.store_dir.clone(), &self.manifest); + let options = + ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required); + let arrow_metadata = ArrowReaderMetadata::try_new(Arc::clone(&self.metadata), options)?; + ParquetRecordBatchReaderBuilder::new_with_metadata(chunk_reader, arrow_metadata).build() + } + + /// Read all data from the page store and return as [`RecordBatch`]es. + /// + /// Convenient for small datasets and tests. For large files use + /// [`Self::reader`] to stream batches one at a time. + pub fn read_batches(&self) -> Result> { + self.reader()? + .collect::, _>>() + .map_err(|e| crate::errors::ParquetError::General(e.to_string())) + } + + fn parse_manifest(metadata: &ParquetMetaData) -> Result { + let kv = metadata + .file_metadata() + .key_value_metadata() + .and_then(|kvs| kvs.iter().find(|kv| kv.key == MANIFEST_KEY)) + .ok_or_else(|| { + crate::errors::ParquetError::General(format!( + "Missing '{MANIFEST_KEY}' in parquet key-value metadata" + )) + })?; + + let value = kv.value.as_ref().ok_or_else(|| { + crate::errors::ParquetError::General(format!("'{MANIFEST_KEY}' has no value")) + })?; + + serde_json::from_str(value) + .map_err(|e| crate::errors::ParquetError::General(e.to_string())) + } +} + +// --------------------------------------------------------------------------- +// PageStoreChunkReader — on-demand ChunkReader backed by page blobs +// --------------------------------------------------------------------------- + +/// A [`ChunkReader`] that serves byte ranges from page store blobs. 
+/// +/// Builds a sorted interval map from the metadata offsets to page file hashes, +/// so that any byte-range request from the Parquet decoder is resolved by +/// reading only the appropriate `.page` file(s) from disk. +pub struct PageStoreChunkReader { + store_dir: PathBuf, + /// Sorted map: virtual file offset -> (size, hash). + pages: BTreeMap, + /// Virtual file length (max offset + size across all pages). + total_len: u64, +} + +impl PageStoreChunkReader { + fn new(store_dir: PathBuf, manifest: &PageStoreManifest) -> Self { + let mut pages = BTreeMap::new(); + let mut total_len: u64 = 0; + for pr in &manifest.pages { + pages.insert(pr.offset, (pr.size, pr.hash.clone())); + let end = pr.offset as u64 + pr.size as u64; + if end > total_len { + total_len = end; + } + } + Self { + store_dir, + pages, + total_len, + } + } + + fn read_page_file(&self, hash: &str) -> io::Result { + let path = self.store_dir.join(format!("{hash}.page")); + let data = fs::read(&path)?; + Ok(Bytes::from(data)) + } +} + +impl Length for PageStoreChunkReader { + fn len(&self) -> u64 { + self.total_len + } +} + +impl ChunkReader for PageStoreChunkReader { + type T = Cursor; + + fn get_read(&self, start: u64) -> Result { + let bytes = self.get_bytes(start, (self.total_len - start) as usize)?; + Ok(Cursor::new(bytes)) + } + + fn get_bytes(&self, start: u64, length: usize) -> Result { + let end = start as i64 + length as i64; + let mut result = Vec::with_capacity(length); + + let scan_start = self + .pages + .range(..=start as i64) + .next_back() + .map(|(&o, _)| o) + .unwrap_or(0); + + for (&offset, (size, hash)) in self.pages.range(scan_start..) 
{ + if offset >= end { + break; + } + + let page_data = self.read_page_file(hash)?; + + let copy_start = (start as i64 - offset).max(0) as usize; + let copy_end = (end - offset).min(*size as i64) as usize; + + if copy_start < copy_end && copy_start < page_data.len() { + let actual_end = copy_end.min(page_data.len()); + result.extend_from_slice(&page_data[copy_start..actual_end]); + } + + if result.len() >= length { + break; + } + } + + result.truncate(length); + Ok(Bytes::from(result)) + } +} diff --git a/parquet/src/arrow/page_store/writer.rs b/parquet/src/arrow/page_store/writer.rs new file mode 100644 index 000000000000..0c5377d3741c --- /dev/null +++ b/parquet/src/arrow/page_store/writer.rs @@ -0,0 +1,511 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`PageStoreWriter`] — writes Arrow data to a content-addressed page store. 
+ +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::{Arc, Mutex}; + +use bytes::Bytes; + +use arrow_array::RecordBatch; +use arrow_schema::{DataType as ArrowDataType, SchemaRef}; + +use super::{PageRef, PageStoreManifest, MANIFEST_KEY}; +use crate::arrow::arrow_writer::{ + ArrowColumnChunk, ArrowColumnChunkData, ArrowColumnWriterImpl, ArrowRowGroupWriter, + SharedColumnChunk, +}; +use crate::arrow::ArrowSchemaConverter; +use crate::column::chunker::ContentDefinedChunker; +use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; +use crate::column::writer::{GenericColumnWriter, get_column_writer}; +use crate::errors::Result; +use crate::file::metadata::{ + FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataBuilder, ParquetMetaDataWriter, + RowGroupMetaData, +}; +use crate::file::page_index::column_index::ColumnIndexMetaData; +use crate::file::page_index::offset_index::OffsetIndexMetaData; +use crate::file::properties::{CdcOptions, EnabledStatistics, WriterProperties, WriterPropertiesPtr}; +use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; +use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, SchemaDescriptor}; + +// --------------------------------------------------------------------------- +// ContentAddressedPageWriter — internal PageWriter impl +// --------------------------------------------------------------------------- + +/// A [`PageWriter`] that writes each page to a content-addressed store directory. 
+struct ContentAddressedPageWriter { + buffer: SharedColumnChunk, + store_dir: PathBuf, + page_refs: Arc>>, + row_group: usize, + column: usize, + page_count: usize, +} + +impl ContentAddressedPageWriter { + fn new( + store_dir: PathBuf, + page_refs: Arc>>, + row_group: usize, + column: usize, + ) -> Self { + Self { + buffer: Arc::new(Mutex::new(ArrowColumnChunkData::default())), + store_dir, + page_refs, + row_group, + column, + page_count: 0, + } + } +} + +impl PageWriter for ContentAddressedPageWriter { + fn write_page(&mut self, page: CompressedPage) -> Result { + let page_header = page.to_thrift_header()?; + let mut header_bytes = Vec::with_capacity(256); + { + let mut protocol = ThriftCompactOutputProtocol::new(&mut header_bytes); + page_header.write_thrift(&mut protocol)?; + } + let header = Bytes::from(header_bytes); + + let data = page.compressed_page().buffer().clone(); + let compressed_size = data.len() + header.len(); + + let mut hasher = blake3::Hasher::new(); + hasher.update(&header); + hasher.update(&data); + let hash = hasher.finalize(); + let hash_hex = hash.to_hex().to_string(); + + let page_path = self.store_dir.join(format!("{hash_hex}.page")); + if !page_path.exists() { + let mut file = fs::File::create(&page_path)?; + file.write_all(&header)?; + file.write_all(&data)?; + } + + let mut buf = self.buffer.try_lock().unwrap(); + let offset = buf.length as u64; + buf.length += compressed_size; + buf.data.push(header.clone()); + buf.data.push(data); + + let is_dict = page.page_type() == crate::basic::PageType::DICTIONARY_PAGE; + self.page_refs.lock().unwrap().push(PageRef { + row_group: self.row_group, + column: self.column, + page_index: self.page_count, + offset: offset as i64, + size: compressed_size as i32, + hash: hash_hex, + is_dict, + }); + self.page_count += 1; + + let mut spec = PageWriteSpec::new(); + spec.page_type = page.page_type(); + spec.num_values = page.num_values(); + spec.uncompressed_size = page.uncompressed_size() + 
header.len();
        spec.offset = offset;
        spec.compressed_size = compressed_size;
        spec.bytes_written = compressed_size as u64;
        Ok(spec)
    }

    fn close(&mut self) -> Result<()> {
        Ok(())
    }
}

// ---------------------------------------------------------------------------
// Column writer factory
// ---------------------------------------------------------------------------

/// Build one [`ArrowColumnWriter`] per Parquet leaf column of `arrow_schema`.
///
/// Each writer is backed by a [`ContentAddressedPageWriter`] that writes its
/// pages into `store_dir` and records page references for `row_group` in
/// `page_refs`.
// NOTE(review): the generic parameters of `page_refs` were reconstructed as
// `Arc<Mutex<Vec<PageRef>>>` from later field accesses (`pr.row_group`,
// `pr.column`, `pr.offset`) — confirm against the original source.
fn create_column_writers(
    schema: &SchemaDescriptor,
    arrow_schema: &SchemaRef,
    props: &WriterPropertiesPtr,
    store_dir: &Path,
    page_refs: &Arc<Mutex<Vec<PageRef>>>,
    row_group: usize,
) -> Result<Vec<crate::arrow::arrow_writer::ArrowColumnWriter>> {
    let mut writers = Vec::new();
    let mut leaves = schema.columns().iter();
    let mut col_idx = 0usize;
    for field in &arrow_schema.fields {
        create_writers_for_type(
            field.data_type(),
            props,
            &mut leaves,
            store_dir,
            page_refs,
            row_group,
            &mut col_idx,
            &mut writers,
        )?;
    }
    Ok(writers)
}

/// Construct a single column writer backed by a content-addressed page writer.
///
/// `use_byte_array` selects the specialized byte-array encoder path used for
/// string/binary columns; otherwise the generic typed column writer is used.
fn make_column_writer(
    desc: &ColumnDescPtr,
    props: &WriterPropertiesPtr,
    store_dir: &Path,
    page_refs: &Arc<Mutex<Vec<PageRef>>>,
    row_group: usize,
    col_idx: usize,
    use_byte_array: bool,
) -> Result<crate::arrow::arrow_writer::ArrowColumnWriter> {
    let pw = Box::new(ContentAddressedPageWriter::new(
        store_dir.to_path_buf(),
        page_refs.clone(),
        row_group,
        col_idx,
    ));
    // Keep a handle to the shared chunk buffer before the page writer is
    // moved into the column writer below.
    let chunk: SharedColumnChunk = pw.buffer.clone();

    let writer = if use_byte_array {
        ArrowColumnWriterImpl::ByteArray(GenericColumnWriter::new(
            desc.clone(),
            props.clone(),
            pw,
        ))
    } else {
        ArrowColumnWriterImpl::Column(get_column_writer(desc.clone(), props.clone(), pw))
    };

    Ok(crate::arrow::arrow_writer::ArrowColumnWriter { chunk, writer })
}

/// Recursively walk `data_type`, consuming one descriptor from `leaves` for
/// every Parquet leaf column and appending the matching writer to `out`.
///
/// # Errors
///
/// Returns `ParquetError::General` if the Arrow schema implies more leaf
/// columns than the Parquet schema provides (the previous implementation
/// panicked via `unwrap` here), and `ParquetError::NYI` for unsupported
/// Arrow types.
fn create_writers_for_type(
    data_type: &ArrowDataType,
    props: &WriterPropertiesPtr,
    leaves: &mut std::slice::Iter<'_, ColumnDescPtr>,
    store_dir: &Path,
    page_refs: &Arc<Mutex<Vec<PageRef>>>,
    row_group: usize,
    col_idx: &mut usize,
    out: &mut Vec<crate::arrow::arrow_writer::ArrowColumnWriter>,
) -> Result<()> {
    // Consume the next leaf descriptor and build its writer. Shared by the
    // primitive (`use_byte_array == false`) and byte-array (`true`) paths,
    // replacing the two previous near-identical closures.
    let leaf = |idx: &mut usize,
                leaves: &mut std::slice::Iter<'_, ColumnDescPtr>,
                use_byte_array: bool| {
        let desc = leaves.next().ok_or_else(|| {
            crate::errors::ParquetError::General(
                "PageStoreWriter: Arrow schema has more leaf columns than the Parquet schema"
                    .to_string(),
            )
        })?;
        let i = *idx;
        *idx += 1;
        make_column_writer(desc, props, store_dir, page_refs, row_group, i, use_byte_array)
    };

    match data_type {
        _ if data_type.is_primitive() => out.push(leaf(col_idx, leaves, false)?),
        ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Boolean | ArrowDataType::Null => {
            out.push(leaf(col_idx, leaves, false)?)
        }
        ArrowDataType::LargeBinary
        | ArrowDataType::Binary
        | ArrowDataType::Utf8
        | ArrowDataType::LargeUtf8
        | ArrowDataType::BinaryView
        | ArrowDataType::Utf8View => out.push(leaf(col_idx, leaves, true)?),
        ArrowDataType::List(f)
        | ArrowDataType::LargeList(f)
        | ArrowDataType::FixedSizeList(f, _)
        | ArrowDataType::ListView(f)
        | ArrowDataType::LargeListView(f) => {
            // Lists add repetition levels but only their element contributes
            // leaf columns.
            create_writers_for_type(
                f.data_type(),
                props,
                leaves,
                store_dir,
                page_refs,
                row_group,
                col_idx,
                out,
            )?;
        }
        ArrowDataType::Struct(fields) => {
            for field in fields {
                create_writers_for_type(
                    field.data_type(),
                    props,
                    leaves,
                    store_dir,
                    page_refs,
                    row_group,
                    col_idx,
                    out,
                )?;
            }
        }
        ArrowDataType::Map(f, _) => match f.data_type() {
            // A map is encoded as an entries struct of (key, value) groups.
            ArrowDataType::Struct(f) => {
                create_writers_for_type(
                    f[0].data_type(),
                    props,
                    leaves,
                    store_dir,
                    page_refs,
                    row_group,
                    col_idx,
                    out,
                )?;
                create_writers_for_type(
                    f[1].data_type(),
                    props,
                    leaves,
                    store_dir,
                    page_refs,
                    row_group,
                    col_idx,
                    out,
                )?;
            }
            _ => unreachable!("invalid map type"),
        },
        ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() {
            ArrowDataType::Utf8
            | ArrowDataType::LargeUtf8
            | ArrowDataType::Binary
            | ArrowDataType::LargeBinary
            | ArrowDataType::Utf8View
            | ArrowDataType::BinaryView
            | ArrowDataType::FixedSizeBinary(_) => out.push(leaf(col_idx, leaves, true)?),
            _ => out.push(leaf(col_idx, leaves, false)?),
        },
        _ => {
            return Err(crate::errors::ParquetError::NYI(format!(
                "PageStoreWriter: unsupported Arrow type {data_type}"
            )));
        }
    }
    Ok(())
}

// ---------------------------------------------------------------------------
// PageStoreWriter
// ---------------------------------------------------------------------------

/// Writes Arrow [`RecordBatch`]es to a content-addressed page store.
///
/// Each data page is written as a separate file named by its BLAKE3 hash
/// under `store_dir`. The metadata-only Parquet file is written to an
/// explicit path on [`Self::finish`], containing the schema, row group
/// metadata, and a manifest mapping page locations to their hashes.
///
/// A single `store_dir` can hold pages belonging to many Parquet files;
/// identical pages across files are automatically deduplicated.
///
/// # Example
/// ```no_run
/// # use std::sync::Arc;
/// # use arrow_array::{ArrayRef, Int32Array, RecordBatch};
/// # use parquet::arrow::page_store::PageStoreWriter;
/// # use parquet::file::properties::WriterProperties;
/// let batch = RecordBatch::try_from_iter(vec![
///     ("id", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef),
/// ]).unwrap();
///
/// let store = std::env::temp_dir().join("pages");
/// let mut writer = PageStoreWriter::try_new(&store, batch.schema(), None).unwrap();
/// writer.write(&batch).unwrap();
/// writer.finish(store.join("table_a.parquet")).unwrap();
/// ```
// NOTE(review): the generic parameters of the container fields below were
// reconstructed from usage elsewhere in this file — confirm against the
// original source.
pub struct PageStoreWriter {
    store_dir: PathBuf,
    schema: SchemaDescPtr,
    arrow_schema: SchemaRef,
    props: WriterPropertiesPtr,
    // Shared with every ContentAddressedPageWriter; collects one entry per
    // written page so finish() can embed the manifest.
    page_refs: Arc<Mutex<Vec<PageRef>>>,
    row_groups: Vec<RowGroupMetaData>,
    column_indexes: Vec<Vec<ColumnIndexMetaData>>,
    offset_indexes: Vec<Vec<OffsetIndexMetaData>>,
    // Row group currently being written; created lazily on first write().
    in_progress: Option<ArrowRowGroupWriter>,
    // One CDC chunker per leaf column when content-defined chunking is on.
    cdc_chunkers: Option<Vec<ContentDefinedChunker>>,
    row_group_index: usize,
    total_rows: i64,
    // Next free offset in the "virtual file" the page offsets are rebased to.
    next_page_offset: i64,
}

impl PageStoreWriter {
    /// Create a new `PageStoreWriter`.
    ///
    /// Creates `store_dir` if it does not exist.
+ pub fn try_new( + store_dir: impl Into, + arrow_schema: SchemaRef, + props: Option, + ) -> Result { + let store_dir = store_dir.into(); + fs::create_dir_all(&store_dir)?; + + let props = props.unwrap_or_else(|| { + WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .set_content_defined_chunking(Some(CdcOptions::default())) + .build() + }); + + let cdc_default = CdcOptions::default(); + let cdc_opts = props.content_defined_chunking().or(Some(&cdc_default)); + + let schema = { + let converter = ArrowSchemaConverter::new().with_coerce_types(props.coerce_types()); + converter.convert(&arrow_schema)? + }; + + let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); + + let cdc_chunkers = cdc_opts + .map(|opts| { + schema_descr + .columns() + .iter() + .map(|desc| ContentDefinedChunker::new(desc, opts)) + .collect::>>() + }) + .transpose()?; + + let props_ptr = Arc::new(props); + + Ok(Self { + store_dir, + schema: schema_descr, + arrow_schema, + props: props_ptr, + page_refs: Arc::new(Mutex::new(Vec::new())), + row_groups: Vec::new(), + column_indexes: Vec::new(), + offset_indexes: Vec::new(), + in_progress: None, + cdc_chunkers, + row_group_index: 0, + total_rows: 0, + next_page_offset: 0, + }) + } + + /// Write a [`RecordBatch`] to the page store. + pub fn write(&mut self, batch: &RecordBatch) -> Result<()> { + if self.in_progress.is_none() { + let writers = create_column_writers( + &self.schema, + &self.arrow_schema, + &self.props, + &self.store_dir, + &self.page_refs, + self.row_group_index, + )?; + self.in_progress = Some(ArrowRowGroupWriter::new(writers, &self.arrow_schema)); + } + + let in_progress = self.in_progress.as_mut().unwrap(); + match self.cdc_chunkers.as_mut() { + Some(chunkers) => in_progress.write_with_chunkers(batch, chunkers)?, + None => in_progress.write(batch)?, + } + Ok(()) + } + + /// Flush the current row group. 
+ pub fn flush(&mut self) -> Result<()> { + let in_progress = match self.in_progress.take() { + Some(ip) => ip, + None => return Ok(()), + }; + + let buffered_rows = in_progress.buffered_rows; + let chunks: Vec = in_progress.close()?; + + let mut column_metadata = Vec::with_capacity(chunks.len()); + let mut col_indexes: Vec = Vec::with_capacity(chunks.len()); + let mut off_indexes: Vec = Vec::with_capacity(chunks.len()); + let mut total_byte_size = 0i64; + + let mut cumulative_offset: i64 = self.next_page_offset; + let mut col_idx = 0usize; + + for chunk in chunks { + let mut close = chunk.close; + total_byte_size += close.metadata.uncompressed_size(); + + let src_dict_offset = close.metadata.dictionary_page_offset(); + let src_data_offset = close.metadata.data_page_offset(); + let src_start = src_dict_offset.unwrap_or(src_data_offset); + let delta = cumulative_offset - src_start; + + let mut col_builder = close.metadata.into_builder(); + col_builder = col_builder.set_data_page_offset(src_data_offset + delta); + if let Some(dict_off) = src_dict_offset { + col_builder = col_builder.set_dictionary_page_offset(Some(dict_off + delta)); + } + close.metadata = col_builder.build()?; + + if let Some(ref mut oi) = close.offset_index { + for loc in &mut oi.page_locations { + loc.offset += delta; + } + } + + { + let mut page_refs = self.page_refs.lock().unwrap(); + for pr in page_refs.iter_mut() { + if pr.row_group == self.row_group_index && pr.column == col_idx { + pr.offset += delta; + } + } + } + + col_idx += 1; + cumulative_offset += close.metadata.compressed_size(); + + column_metadata.push(close.metadata); + col_indexes.push(close.column_index.unwrap_or(ColumnIndexMetaData::NONE)); + if let Some(oi) = close.offset_index { + off_indexes.push(oi); + } else { + off_indexes.push(OffsetIndexMetaData { + page_locations: vec![], + unencoded_byte_array_data_bytes: None, + }); + } + } + + self.next_page_offset = cumulative_offset; + + let row_group = 
RowGroupMetaData::builder(self.schema.clone()) + .set_column_metadata(column_metadata) + .set_total_byte_size(total_byte_size) + .set_num_rows(buffered_rows as i64) + .set_ordinal(self.row_group_index as i16) + .build()?; + + self.total_rows += buffered_rows as i64; + self.row_groups.push(row_group); + self.column_indexes.push(col_indexes); + self.offset_indexes.push(off_indexes); + self.row_group_index += 1; + Ok(()) + } + + /// Flush remaining data and write the metadata-only Parquet file to `path`. + pub fn finish(mut self, path: impl AsRef) -> Result { + self.flush()?; + + let page_refs = self.page_refs.lock().unwrap().clone(); + let manifest = PageStoreManifest { pages: page_refs }; + let manifest_json = serde_json::to_string(&manifest) + .map_err(|e| crate::errors::ParquetError::General(e.to_string()))?; + + let file_metadata = FileMetaData::new( + 2, + self.total_rows, + Some("parquet-rs page_store".to_string()), + Some(vec![KeyValue::new(MANIFEST_KEY.to_string(), manifest_json)]), + self.schema.clone(), + None, + ); + + let mut builder = ParquetMetaDataBuilder::new(file_metadata); + for rg in self.row_groups { + builder = builder.add_row_group(rg); + } + builder = builder.set_column_index(Some(self.column_indexes)); + builder = builder.set_offset_index(Some(self.offset_indexes)); + let metadata = builder.build(); + + let file = fs::File::create(path.as_ref())?; + ParquetMetaDataWriter::new(file, &metadata).finish()?; + + Ok(metadata) + } +} diff --git a/parquet/src/bin/parquet-page-store.rs b/parquet/src/bin/parquet-page-store.rs new file mode 100644 index 000000000000..47f9e4be1f75 --- /dev/null +++ b/parquet/src/bin/parquet-page-store.rs @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! CLI tool for working with a content-addressed Parquet page store. +//! +//! # Install +//! +//! ```text +//! cargo install parquet --features=page_store,cli +//! ``` +//! +//! # Write a Parquet file into a page store +//! +//! ```text +//! parquet-page-store write input.parquet --store ./pages --output ./meta +//! ``` +//! +//! # Read a page-store-backed Parquet file +//! +//! ```text +//! parquet-page-store read ./meta/input.meta.parquet --store ./pages +//! ``` + +use std::fs::File; +use std::path::PathBuf; + +use arrow_array::RecordBatchReader; +use clap::{Parser, Subcommand, ValueEnum}; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::arrow::page_store::{PageStoreReader, PageStoreWriter}; +use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; +use parquet::errors::Result; +use parquet::file::properties::WriterProperties; + +#[derive(Debug, Parser)] +#[clap(author, version)] +/// Content-addressed Parquet page store. +/// +/// A page store splits Parquet data pages into individual files named by their +/// BLAKE3 hash. Identical pages across files are stored only once, enabling +/// efficient deduplication when used with content-defined chunking (CDC). +/// +/// The workflow has two steps: +/// +/// 1. 
`write` — reads regular Parquet files, re-encodes their pages with CDC +/// chunking, writes each page as a {hash}.page blob into a shared store +/// directory, and produces a lightweight metadata-only Parquet file. +/// +/// 2. `read` — given a metadata Parquet file and the store directory, +/// reassembles the data and prints it. +/// +/// Quick start: +/// +/// # Write a file into the store +/// parquet-page-store write data.parquet --store ./pages --output ./meta +/// +/// # Read it back +/// parquet-page-store read ./meta/data.meta.parquet --store ./pages +/// +/// # Write several files (pages are deduplicated across them) +/// parquet-page-store write a.parquet b.parquet --store ./pages +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + /// Write Parquet files into a page store. + /// + /// Each input file is read, its pages are re-encoded with CDC chunking and + /// written to the store directory as {hash}.page blobs. A metadata-only + /// Parquet file is produced for each input (named {stem}.meta.parquet). + /// + /// Multiple files can share the same store directory — identical pages are + /// automatically deduplicated. + /// + /// Examples: + /// + /// # Single file, metadata written to current directory + /// parquet-page-store write data.parquet --store ./pages + /// + /// # Explicit output directory + /// parquet-page-store write data.parquet --store ./pages --output ./meta + /// + /// # Multiple files into the same store + /// parquet-page-store write a.parquet b.parquet --store ./pages + /// + /// # Write without compression + /// parquet-page-store write data.parquet --store ./pages --compression none + Write { + /// Input Parquet file(s). + #[clap(required = true)] + inputs: Vec, + + /// Page store directory for .page blobs (created if it does not exist). + #[clap(short, long)] + store: PathBuf, + + /// Output directory for metadata Parquet files [default: current directory]. 
+ #[clap(short, long)] + output: Option, + + /// Compression codec for page data [default: zstd]. + #[clap(long, default_value = "zstd")] + compression: CompressionArg, + }, + + /// Read a page-store-backed Parquet file and print its contents. + /// + /// The metadata Parquet file contains the schema, row group structure, and + /// a manifest mapping each page to its BLAKE3 hash. The actual page data + /// is read from the store directory. + /// + /// Example: + /// + /// parquet-page-store read data.meta.parquet --store ./pages + Read { + /// Path to the metadata-only Parquet file. + input: PathBuf, + + /// Page store directory containing the .page blobs. + #[clap(short, long)] + store: PathBuf, + }, +} + +#[derive(Debug, Clone, ValueEnum)] +enum CompressionArg { + None, + Snappy, + Gzip, + Lzo, + Brotli, + Lz4, + Zstd, + Lz4Raw, +} + +impl CompressionArg { + fn to_parquet(&self) -> Compression { + match self { + CompressionArg::None => Compression::UNCOMPRESSED, + CompressionArg::Snappy => Compression::SNAPPY, + CompressionArg::Gzip => Compression::GZIP(GzipLevel::default()), + CompressionArg::Lzo => Compression::LZO, + CompressionArg::Brotli => Compression::BROTLI(BrotliLevel::default()), + CompressionArg::Lz4 => Compression::LZ4, + CompressionArg::Zstd => Compression::ZSTD(ZstdLevel::default()), + CompressionArg::Lz4Raw => Compression::LZ4_RAW, + } + } +} + +fn main() { + let cli = Cli::parse(); + let result = match cli.command { + Command::Write { + inputs, + store, + output, + compression, + } => cmd_write(&inputs, &store, output.as_deref(), compression), + Command::Read { input, store } => cmd_read(&input, &store), + }; + if let Err(e) = result { + eprintln!("Error: {e}"); + std::process::exit(1); + } +} + +fn cmd_write( + inputs: &[PathBuf], + store: &PathBuf, + output_dir: Option<&std::path::Path>, + compression: CompressionArg, +) -> Result<()> { + let output_dir = output_dir.unwrap_or_else(|| std::path::Path::new(".")); + 
std::fs::create_dir_all(output_dir)?; + + for input in inputs { + let file = File::open(input)?; + let reader = ParquetRecordBatchReaderBuilder::try_new(file)? + .with_batch_size(8192) + .build()?; + let schema = reader.schema(); + + let stem = input + .file_stem() + .and_then(|s| s.to_str()) + .unwrap_or("output"); + let meta_path = output_dir.join(format!("{stem}.meta.parquet")); + + let props = WriterProperties::builder() + .set_compression(compression.to_parquet()) + .build(); + let mut writer = PageStoreWriter::try_new(store, schema, Some(props))?; + let mut total_rows = 0usize; + for batch in reader { + let batch = + batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; + total_rows += batch.num_rows(); + writer.write(&batch)?; + } + let metadata = writer.finish(&meta_path)?; + + let page_count = metadata + .file_metadata() + .key_value_metadata() + .and_then(|kvs| { + kvs.iter() + .find(|kv| kv.key == "page_store.manifest") + .and_then(|kv| kv.value.as_ref()) + }) + .and_then(|v| { + serde_json::from_str::(v) + .ok() + .and_then(|j| j["pages"].as_array().map(|a| a.len())) + }) + .unwrap_or(0); + + eprintln!( + "{}: {} rows, {} row group(s), {} pages -> {}", + input.display(), + total_rows, + metadata.num_row_groups(), + page_count, + meta_path.display(), + ); + } + + let page_files = std::fs::read_dir(store)? + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .count(); + eprintln!( + "Page store: {} page file(s) in {}", + page_files, + store.display() + ); + + Ok(()) +} + +fn cmd_read(input: &PathBuf, store: &PathBuf) -> Result<()> { + let reader = PageStoreReader::try_new(input, store)?; + let md = reader.metadata(); + + eprintln!( + "Schema: {} column(s), {} row group(s), {} total row(s)", + md.row_groups().first().map_or(0, |rg| rg.num_columns()), + md.num_row_groups(), + md.file_metadata().num_rows(), + ); + + let mut total_rows = 0usize; + for batch in reader.reader()? 
{ + let batch = batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; + total_rows += batch.num_rows(); + } + eprintln!("Read {} row(s)", total_rows); + + Ok(()) +} From 0d34a46a66e2bdb85df821d94cd3bd3fd01ea8b6 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 19:05:10 +0200 Subject: [PATCH 4/8] feat(parquet): add page store demo, reconstruct CLI command, and roundtrip verification --- parquet/Cargo.toml | 3 +- parquet/examples/page_store.rs | 6 +- parquet/examples/page_store_dedup/.gitignore | 6 + parquet/examples/page_store_dedup/README.md | 159 +++++++ parquet/examples/page_store_dedup/concept.py | 360 +++++++++++++++ .../page_store_dedup/page_store_concept.svg | 220 ++++++++++ parquet/examples/page_store_dedup/pipeline.py | 409 ++++++++++++++++++ parquet/src/arrow/mod.rs | 4 +- parquet/src/arrow/page_store/mod.rs | 314 +++++++++++--- parquet/src/arrow/page_store/reader.rs | 13 +- parquet/src/arrow/page_store/writer.rs | 58 ++- parquet/src/bin/parquet-page-store.rs | 117 ++++- 12 files changed, 1571 insertions(+), 98 deletions(-) create mode 100644 parquet/examples/page_store_dedup/.gitignore create mode 100644 parquet/examples/page_store_dedup/README.md create mode 100644 parquet/examples/page_store_dedup/concept.py create mode 100644 parquet/examples/page_store_dedup/page_store_concept.svg create mode 100644 parquet/examples/page_store_dedup/pipeline.py diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index 5e4eeacee1f8..0bf5f66eeeac 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -66,6 +66,7 @@ num-integer = { version = "0.1.46", default-features = false, features = ["std"] num-traits = { version = "0.2.19", default-features = false, features = ["std"] } base64 = { version = "0.22", default-features = false, features = ["std", ], optional = true } clap = { version = "4.1", default-features = false, features = ["std", "derive", "env", "help", "error-context", "usage"], optional = true } +glob = { version 
= "0.3", default-features = false, optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } seq-macro = { version = "0.3", default-features = false } @@ -110,7 +111,7 @@ arrow = ["base64", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", # Enable support for arrow canonical extension types arrow_canonical_extension_types = ["arrow-schema?/canonical_extension_types"] # Enable CLI tools -cli = ["json", "base64", "clap", "arrow-csv", "serde"] +cli = ["json", "base64", "clap", "arrow-csv", "serde", "dep:glob"] # Enable JSON APIs json = ["serde_json", "base64"] # Enable internal testing APIs diff --git a/parquet/examples/page_store.rs b/parquet/examples/page_store.rs index 8b963329c997..736ce9354694 100644 --- a/parquet/examples/page_store.rs +++ b/parquet/examples/page_store.rs @@ -87,7 +87,11 @@ fn main() -> parquet::errors::Result<()> { let batches = reader.read_batches()?; let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); - println!("Read {} batch(es), {} total rows", batches.len(), total_rows); + println!( + "Read {} batch(es), {} total rows", + batches.len(), + total_rows + ); // Display let formatted = pretty_format_batches(&batches).unwrap(); diff --git a/parquet/examples/page_store_dedup/.gitignore b/parquet/examples/page_store_dedup/.gitignore new file mode 100644 index 000000000000..9d823ef6cc69 --- /dev/null +++ b/parquet/examples/page_store_dedup/.gitignore @@ -0,0 +1,6 @@ +data/ +meta/ +pages/ +verify/ +.venv/ +.cache/ diff --git a/parquet/examples/page_store_dedup/README.md b/parquet/examples/page_store_dedup/README.md new file mode 100644 index 000000000000..63ffa70f2ceb --- /dev/null +++ b/parquet/examples/page_store_dedup/README.md @@ -0,0 +1,159 @@ +# Parquet Page Store — Deduplication Demo + +> **Prototype**: This is an experimental feature exploring content-defined +> 
chunking for Parquet. APIs and file formats may change.

Demonstrates how Content-Defined Chunking (CDC) enables efficient deduplication
across multiple versions of a dataset using the Parquet page store writer in
Apache Arrow Rust. The deduplication is self-contained in the Parquet writer —
no special storage system is required.

## What this demo shows

Four common dataset operations are applied to a real-world dataset
([OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5)
conversational data, ~800 MB per file). Each operation produces a separate
Parquet file. Without a page store, storing all four files costs the full sum
of their sizes. With the CDC page store, identical pages are stored **exactly
once** — indexed by their BLAKE3 hash — so the four files share most of their
bytes. The resulting files can be stored anywhere.

| File | Operation |
|------|-----------|
| `original.parquet` | Baseline dataset (~996k rows) |
| `filtered.parquet` | Keep rows where `num_turns ≤ 3` |
| `augmented.parquet` | Original + computed column `num_turns` |
| `appended.parquet` | Original + 5 000 new rows appended |

## Prerequisites

```bash
pip install pyarrow matplotlib huggingface_hub drawsvg
cargo build --release -p parquet --features page_store,cli
```

## Running the demo

```bash
cd parquet/examples/page_store_dedup

# Run the full pipeline: prepare data, build binary, ingest into page store, show stats
python pipeline.py

# Then generate diagrams
python concept.py
```

Individual steps can be skipped if they've already run:

```bash
python pipeline.py --skip-prepare --skip-build             # re-run ingest + stats only
python pipeline.py --skip-prepare --skip-build --skip-ingest  # stats only
```

Outputs:
- `page_store_concept.png` — architectural overview of how shared pages work
- `page_store_savings.png` — side-by-side storage comparison with real numbers

## Using your own dataset

```bash
python pipeline.py
--file /path/to/your.parquet +``` + +The script requires a `conversations` list column for the filtered and augmented +variants. Adapt `pipeline.py` to your own schema as needed. + +## Results + +Dataset: **OpenHermes-2.5** (short conversations, `num_turns < 10`) + +### Dataset variants + +| File | Operation | Rows | Size | +|------|-----------|------|------| +| `original.parquet` | Baseline | 996,009 | 782.1 MB | +| `filtered.parquet` | Keep `num_turns ≤ 3` (removes 0.2% of rows) | 993,862 | 776.8 MB | +| `augmented.parquet` | Add column `num_turns` | 996,009 | 782.2 MB | +| `appended.parquet` | Append 5,000 rows | 1,001,009 | 788.6 MB | +| **Total** | | | **3,129.7 MB** | + +### Page store results + +| Metric | Value | +|--------|-------| +| Unique pages stored | 3,400 | +| Total page references | 15,179 | +| Page store size | 559.0 MB | +| Metadata files size | 4.4 MB | +| **Page store + metadata** | **563.4 MB** | +| **Storage saved** | **2,566.3 MB (82%)** | +| **Deduplication ratio** | **5.6×** | + +### Per-file page breakdown + +| File | Page refs | Unique hashes | New pages | Reused pages | +|------|-----------|---------------|-----------|--------------| +| `original.parquet` | 3,782 | 3,100 | 3,100 | 0 | +| `filtered.parquet` | 3,755 | 3,075 | 222 | 2,853 (92%) | +| `augmented.parquet` | 3,834 | 3,136 | 36 | 3,100 (98%) | +| `appended.parquet` | 3,808 | 3,125 | 42 | 3,083 (98%) | + +### Key insights + +1. **Adding a column** (`augmented`): only 36 new pages out of 3,136 (1.1%). + The existing 17 columns produce identical CDC pages — only the new `num_turns` + column contributes new pages. + +2. **Appending rows** (`appended`): only 42 new pages out of 3,125 (1.3%). + The original 996k rows' pages are unchanged; only the 5k new rows create new pages. + +3. **Filtering rows** (`filtered`): 92% of pages reused despite row removal. + Removing just 0.2% of rows barely shifts CDC boundaries — most pages are + unchanged. 
Heavier filtering (removing 20–50% of rows) would produce more new + pages, as CDC boundaries shift further throughout the file. + +4. **Net result**: 4 dataset versions stored for **563 MB instead of 3.1 GB** — an + **82% reduction**, or equivalently, 4 versions for the cost of **0.72×** a single + version. + +## How it works + +``` +Standard Parquet — each file stored independently: + + original.parquet ──► [ page 1 ][ page 2 ][ page 3 ]...[ page N ] + filtered.parquet ──► [ page 1'][ page 2 ][ page 3 ]...[ page M ] + augmented.parquet ──► [ page 1 ][ page 2 ][ page 3 ]...[ page N ][ extra ] + appended.parquet ──► [ page 1 ][ page 2 ][ page 3 ]...[ page N ][ new ] + + Total: sum of all four file sizes + +CDC Page Store — content-addressed, deduplicated: + + pages/ + .page ← shared by original, augmented, appended + .page ← shared by original, filtered, augmented, appended + .page ← shared by filtered only (boundary page) + ... (only UNIQUE pages stored) + + meta/ + original.meta.parquet ← tiny manifest referencing page hashes + filtered.meta.parquet + augmented.meta.parquet + appended.meta.parquet + + Total: ~18% of the combined file sizes +``` + +CDC ensures that page boundaries are **content-defined** (not fixed row +counts), so adding columns or appending rows only requires storing the small +number of new pages — the rest remain identical and are reused. + +## Further reading + +- [`parquet::arrow::page_store`][api] API docs +- [`parquet-page-store` CLI][cli] source + +[api]: https://docs.rs/parquet/latest/parquet/arrow/page_store/index.html +[cli]: ../../src/bin/parquet-page-store.rs diff --git a/parquet/examples/page_store_dedup/concept.py b/parquet/examples/page_store_dedup/concept.py new file mode 100644 index 000000000000..cdd30789145e --- /dev/null +++ b/parquet/examples/page_store_dedup/concept.py @@ -0,0 +1,360 @@ +#!/usr/bin/env python3 +""" +Generate the Parquet Page Store concept diagram. 
+ +Output: page_store_concept.svg (open in any browser) + page_store_concept.png (requires drawsvg[raster]) +""" + +import os +import drawsvg as draw + +HERE = os.path.dirname(__file__) + +# --------------------------------------------------------------------------- +# Palette +# --------------------------------------------------------------------------- + +BG = "#0f1117" +SURFACE = "#161b22" +BORDER = "#2a2f3a" +TEXT_HI = "#f0f6fc" +TEXT_LO = "#6e7681" +BLUE = "#4493f8" +GREEN = "#3fb950" +PURPLE = "#bc8cff" +ORANGE = "#f0883e" +WHITE = "#ffffff" + +# --------------------------------------------------------------------------- +# Layout grid (derive everything from these constants) +# --------------------------------------------------------------------------- + +PAD = 28 # outer margin +GAP = 120 # gap between file panel right edge and store left edge + +FILE_W = 360 +FILE_H = 104 +FILE_GAP = 14 # vertical gap between file cards + +N_FILES = 4 +FILES_H = N_FILES * FILE_H + (N_FILES - 1) * FILE_GAP # 502 + +STORE_Y_PAD = 38 # store header height (folder name + divider) +STORE_LEG_H = 82 # legend block at bottom of store +STORE_H = FILES_H # store and file panel share the same height + +TITLE_H = 82 # space taken by title block +TOP_Y = TITLE_H + 12 + +CMP_H = 82 # bottom comparison bar height +CMP_GAP = 18 + +STORE_W = 256 # fixed, intentionally compact +STORE_X = PAD + FILE_W + GAP +CANVAS_W = STORE_X + STORE_W + PAD +CANVAS_H = TOP_Y + STORE_H + CMP_GAP + CMP_H + PAD + + +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- + +def generate(total_mb: float = 3129.7, store_mb: float = 563.4) -> None: + """Render the concept SVG (and optionally PNG) to the script directory.""" + + d = draw.Drawing(CANVAS_W, CANVAS_H) + + # ----------------------------------------------------------------------- + # Drawing helpers (closures over d) + # 
----------------------------------------------------------------------- + + def bg_rect(x, y, w, h, fill=SURFACE, stroke=BORDER, rx=8, **kw): + d.append(draw.Rectangle(x, y, w, h, fill=fill, stroke=stroke, + stroke_width=1.5, rx=rx, ry=rx, **kw)) + + def txt(s, x, y, size=13, fill=TEXT_HI, anchor="middle", weight="normal", **kw): + d.append(draw.Text(s, size, x, y, text_anchor=anchor, fill=fill, + font_weight=weight, + font_family="ui-monospace,'SF Mono',monospace", **kw)) + + def hline(x1, x2, y, color=BORDER, width=1, opacity=1.0, dash=None): + kw = {"stroke_dasharray": dash} if dash else {} + d.append(draw.Line(x1, y, x2, y, stroke=color, stroke_width=width, + stroke_opacity=opacity, **kw)) + + arrowhead_ids: dict[str, str] = {} + + def _arrowhead(color: str) -> str: + if color not in arrowhead_ids: + aid = f"ah{len(arrowhead_ids)}" + arrowhead_ids[color] = aid + m = draw.Marker(-0.1, -3, 4, 3, orient="auto", id=aid) + m.append(draw.Path(d="M0,-2.5 L3.5,0 L0,2.5 Z", fill=color)) + d.append_def(m) + return f"url(#{arrowhead_ids[color]})" + + def arrow_line(x1, y, x2, color, label=None): + path = draw.Path(stroke=color, stroke_width=1.8, stroke_opacity=0.55, + fill="none", marker_end=_arrowhead(color)) + path.M(x1, y) + path.L(x2, y) + d.append(path) + if label: + mx, lw = (x1 + x2) / 2, 72 + d.append(draw.Rectangle(mx - lw / 2, y - 10, lw, 16, + fill="#1c2128", stroke=color, stroke_width=1, + stroke_opacity=0.4, rx=8, ry=8)) + txt(label, mx, y + 2, size=9, fill=color, weight="bold") + + def page_tile(x, y, w, h, color, label): + """Filled page block with glow halo + label.""" + d.append(draw.Rectangle(x - 3, y - 3, w + 6, h + 6, + fill=color, fill_opacity=0.07, rx=6, ry=6)) + d.append(draw.Rectangle(x, y, w, h, fill=color, fill_opacity=0.18, + stroke=color, stroke_width=1.5, stroke_opacity=0.7, + rx=5, ry=5)) + txt(label, x + w / 2, y + h / 2 + 5, size=14, fill=color, weight="bold") + + def file_page(x, y, w, h, color, label): + """Smaller page block used inside 
file cards.""" + d.append(draw.Rectangle(x, y, w, h, fill=color, fill_opacity=0.22, + stroke=color, stroke_width=1, stroke_opacity=0.55, + rx=3, ry=3)) + txt(label, x + w / 2, y + h / 2 + 4, size=9, fill=color, weight="bold") + + # ----------------------------------------------------------------------- + # Background + grid + # ----------------------------------------------------------------------- + + d.append(draw.Rectangle(0, 0, CANVAS_W, CANVAS_H, fill=BG)) + for gx in range(0, CANVAS_W, 40): + d.append(draw.Line(gx, 0, gx, CANVAS_H, stroke=WHITE, + stroke_width=0.18, stroke_opacity=0.04)) + for gy in range(0, CANVAS_H, 40): + d.append(draw.Line(0, gy, CANVAS_W, gy, stroke=WHITE, + stroke_width=0.18, stroke_opacity=0.04)) + + # ----------------------------------------------------------------------- + # Title + # ----------------------------------------------------------------------- + + txt("Parquet Page Store", CANVAS_W / 2, 32, size=22, weight="bold") + + cx = CANVAS_W / 2 + d.append(draw.Raw( + f'' + f'Deduplication built into the Arrow Rust Parquet writer using ' + f'Content-Defined Chunking' + f'' + )) + + hline(CANVAS_W / 2 - 230, CANVAS_W / 2 + 230, 63, color=BORDER) + hline(CANVAS_W / 2 - 50, CANVAS_W / 2 + 50, 63, color=BLUE, width=2, opacity=0.45) + + # ----------------------------------------------------------------------- + # Section labels (centered above each panel) + # ----------------------------------------------------------------------- + + FILES_CX = PAD + FILE_W / 2 + STORE_CX = STORE_X + STORE_W / 2 + + LABEL_Y = TOP_Y - 10 + txt("INPUT FILES", FILES_CX, LABEL_Y, size=9, fill=TEXT_LO, weight="bold") + txt("PAGE STORE", STORE_CX, LABEL_Y, size=9, fill=TEXT_LO, weight="bold") + + # ----------------------------------------------------------------------- + # Store card + # ----------------------------------------------------------------------- + + bg_rect(STORE_X, TOP_Y, STORE_W, STORE_H, fill="#0d1117", stroke=BORDER, rx=10) + + txt("pages/", 
STORE_X + 16, TOP_Y + 20, size=11, fill=TEXT_LO, anchor="start") + hline(STORE_X + 12, STORE_X + STORE_W - 12, TOP_Y + 30, color=BORDER) + + # ----------------------------------------------------------------------- + # Unique pages grid (centered inside the store card) + # ----------------------------------------------------------------------- + + UNIQUE_PAGES = [ + (BLUE, "A"), (BLUE, "B"), (BLUE, "C"), (BLUE, "D"), + (BLUE, "E"), (BLUE, "F"), (PURPLE, "G"), (PURPLE, "H"), + (GREEN, "I"), (GREEN, "J"), (ORANGE, "K"), (ORANGE, "L"), + ] + + SP_COLS = 3 + SPW, SPH = 56, 40 + SP_GAP_X = 14 + SP_GAP_Y = 10 + + grid_w = SP_COLS * SPW + (SP_COLS - 1) * SP_GAP_X + SP_START_X = STORE_X + (STORE_W - grid_w) // 2 + SP_START_Y = TOP_Y + STORE_Y_PAD + 10 + + page_centers: dict[str, tuple[float, float]] = {} + + for i, (color, label) in enumerate(UNIQUE_PAGES): + col, row = i % SP_COLS, i // SP_COLS + px = SP_START_X + col * (SPW + SP_GAP_X) + py = SP_START_Y + row * (SPH + SP_GAP_Y) + page_tile(px, py, SPW, SPH, color, label) + page_centers[label] = (px + SPW / 2, py + SPH / 2) + txt(f"#{label.lower()}3f9a…", px + SPW / 2, py + SPH + 10, + size=7, fill=TEXT_LO) + + N_PAGE_ROWS = (len(UNIQUE_PAGES) + SP_COLS - 1) // SP_COLS + last_row_py = SP_START_Y + (N_PAGE_ROWS - 1) * (SPH + SP_GAP_Y) + hash_label_bottom = last_row_py + SPH + 14 + + LIST_MARGIN_X = 14 + LIST_INNER_X = 10 + LINE_H = 13 + LIST_INNER_PY = 7 + + LISTING = [ + ("158k", "a3f9b2e1c04d7f28"), + ("201k", "ff22e9640578db3c"), + ("167k", "bc8cff3ad19f673d"), + ("148k", "4493f8c9b28705f3"), + ("160k", "3fb950efa4891422"), + ] + + list_x = STORE_X + LIST_MARGIN_X + list_w = STORE_W - 2 * LIST_MARGIN_X + list_y = hash_label_bottom + 8 + list_h = len(LISTING) * LINE_H + 2 * LIST_INNER_PY + + d.append(draw.Rectangle(list_x, list_y, list_w, list_h, + fill="#0a0d12", rx=4, ry=4)) + + for i, (size, hash_prefix) in enumerate(LISTING): + baseline = list_y + LIST_INNER_PY + i * LINE_H + LINE_H - 3 + line_txt = f".rw-r--r-- 
{size:>4} {hash_prefix}….page" + txt(line_txt, list_x + LIST_INNER_X, baseline, + size=7.5, fill="#3d4450", anchor="start") + + # ----------------------------------------------------------------------- + # Legend (centered, pinned to bottom of store card) + # ----------------------------------------------------------------------- + + LEG_ITEMS = [ + (BLUE, "shared by all"), + (PURPLE, "filter boundary"), + (GREEN, "new column"), + (ORANGE, "new rows"), + ] + LEG_COL_W = STORE_W / 2 - 4 + LEG_Y0 = TOP_Y + STORE_H - STORE_LEG_H + 20 + + hline(STORE_X + 12, STORE_X + STORE_W - 12, + TOP_Y + STORE_H - STORE_LEG_H, color=BORDER) + + for i, (color, label) in enumerate(LEG_ITEMS): + col, row = i % 2, i // 2 + lx = STORE_X + 20 + col * LEG_COL_W + ly = LEG_Y0 + row * 22 + d.append(draw.Rectangle(lx, ly - 7, 11, 11, fill=color, + fill_opacity=0.85, rx=2, ry=2)) + txt(label, lx + 16, ly + 2, size=10, fill=TEXT_LO, anchor="start") + + # ----------------------------------------------------------------------- + # File cards + # ----------------------------------------------------------------------- + + PW, PH, PGAP = 34, 26, 4 + + FILES = [ + ("original.parquet", "baseline · 996k rows", + [(BLUE,"A"),(BLUE,"B"),(BLUE,"C"),(BLUE,"D"),(BLUE,"E"),(BLUE,"F")], + BLUE, "baseline"), + ("filtered.parquet", "keep num_turns < 3", + [(BLUE,"A"),(PURPLE,"G"),(BLUE,"C"),(BLUE,"D"),(BLUE,"E"),(PURPLE,"H")], + PURPLE, "92% reused"), + ("augmented.parquet", "add num_turns column", + [(BLUE,"A"),(GREEN,"I"),(BLUE,"B"),(BLUE,"C"),(GREEN,"J"),(BLUE,"D"),(BLUE,"E"),(BLUE,"F")], + GREEN, "98% reused"), + ("appended.parquet", "append 5 000 rows", + [(BLUE,"A"),(BLUE,"B"),(BLUE,"C"),(BLUE,"D"),(BLUE,"E"),(BLUE,"F"),(ORANGE,"K"),(ORANGE,"L")], + ORANGE, "98% reused"), + ] + + for fi, (fname, subtitle, pages, accent, reuse_lbl) in enumerate(FILES): + fy = TOP_Y + fi * (FILE_H + FILE_GAP) + card_mid_y = fy + FILE_H / 2 + + bg_rect(PAD, fy, FILE_W, FILE_H, fill=SURFACE, stroke=BORDER, rx=8) + 
d.append(draw.Rectangle(PAD, fy + 10, 3, FILE_H - 20, + fill=accent, fill_opacity=0.85, rx=1, ry=1)) + txt(fname, PAD + 16, fy + 26, size=12, fill=TEXT_HI, weight="bold", anchor="start") + txt(subtitle, PAD + 16, fy + 43, size=10, fill=TEXT_LO, anchor="start") + + strip_x = PAD + 16 + strip_y = fy + FILE_H - PH - 12 + for pi, (pcolor, plabel) in enumerate(pages): + file_page(strip_x + pi * (PW + PGAP), strip_y, PW, PH, pcolor, plabel) + + arrow_line(PAD + FILE_W + 4, card_mid_y, STORE_X - 4, accent, label=reuse_lbl) + + # ----------------------------------------------------------------------- + # Bottom: storage comparison bars + # ----------------------------------------------------------------------- + + CMP_Y = TOP_Y + STORE_H + CMP_GAP + CMP_X = PAD + CMP_W = CANVAS_W - PAD * 2 + + bg_rect(CMP_X, CMP_Y, CMP_W, CMP_H, fill="#0d1117", stroke=BORDER, rx=8) + txt("STORAGE COMPARISON", CMP_X + CMP_W / 2, CMP_Y + 13, + size=9, fill=TEXT_LO, weight="bold") + + LABEL_COL_W = 132 + RIGHT_PAD = 12 + TRACK_X = CMP_X + LABEL_COL_W + TRACK_W = CMP_W - LABEL_COL_W - RIGHT_PAD - 220 + + BAR_H = 20 + savings_pct = round((1 - store_mb / total_mb) * 100) + ratio = total_mb / store_mb + + R1_Y = CMP_Y + 22 + txt("Vanilla Parquet", TRACK_X - 8, R1_Y + BAR_H / 2 + 4, + size=10, fill=TEXT_LO, anchor="end") + d.append(draw.Rectangle(TRACK_X, R1_Y, TRACK_W, BAR_H, + fill="#ef5350", fill_opacity=0.22, + stroke="#ef5350", stroke_width=1.2, stroke_opacity=0.45, + rx=4, ry=4)) + txt(f"{total_mb:,.0f} MB (4 independent files)", + TRACK_X + TRACK_W + 10, R1_Y + BAR_H / 2 + 4, + size=10, fill="#ef9a9a", anchor="start") + + R2_Y = R1_Y + BAR_H + 8 + WITH_W = round(TRACK_W * store_mb / total_mb) + txt("Page Store via CDC", TRACK_X - 8, R2_Y + BAR_H / 2 + 4, + size=10, fill=TEXT_LO, anchor="end") + d.append(draw.Rectangle(TRACK_X, R2_Y, WITH_W, BAR_H, + fill="#66bb6a", fill_opacity=0.22, + stroke="#66bb6a", stroke_width=1.2, stroke_opacity=0.45, + rx=4, ry=4)) + txt(f"{store_mb:,.0f} MB — 
{savings_pct}% less · {ratio:.1f}× ratio", + TRACK_X + WITH_W + 10, R2_Y + BAR_H / 2 + 4, + size=10, fill="#a5d6a7", anchor="start") + + # ----------------------------------------------------------------------- + # Save + # ----------------------------------------------------------------------- + + out_svg = os.path.join(HERE, "page_store_concept.svg") + out_png = os.path.join(HERE, "page_store_concept.png") + d.save_svg(out_svg) + print(f" Saved {out_svg}") + + try: + d.save_png(out_png) + print(f" Saved {out_png}") + except Exception as e: + print(f" PNG skipped ({e}) — open the SVG in a browser") + + +if __name__ == "__main__": + generate() diff --git a/parquet/examples/page_store_dedup/page_store_concept.svg b/parquet/examples/page_store_dedup/page_store_concept.svg new file mode 100644 index 000000000000..fbedd5b227e9 --- /dev/null +++ b/parquet/examples/page_store_dedup/page_store_concept.svg @@ -0,0 +1,220 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Parquet Page Store +Deduplication built into the Arrow Rust Parquet writer using Content-Defined Chunking + + +INPUT FILES +PAGE STORE + +pages/ + + + +A +#a3f9a… + + +B +#b3f9a… + + +C +#c3f9a… + + +D +#d3f9a… + + +E +#e3f9a… + + +F +#f3f9a… + + +G +#g3f9a… + + +H +#h3f9a… + + +I +#i3f9a… + + +J +#j3f9a… + + +K +#k3f9a… + + +L +#l3f9a… + +.rw-r--r-- 158k a3f9b2e1c04d7f28….page +.rw-r--r-- 201k ff22e9640578db3c….page +.rw-r--r-- 167k bc8cff3ad19f673d….page +.rw-r--r-- 148k 4493f8c9b28705f3….page +.rw-r--r-- 160k 3fb950efa4891422….page + + +shared by all + +filter boundary + +new column + +new rows + + +original.parquet +baseline · 996k rows + +A + +B + +C + +D + +E + +F + + +baseline + + +filtered.parquet +keep num_turns < 3 + +A + +G + +C + +D + +E + +H + + +92% reused + + +augmented.parquet +add num_turns column + +A + +I + +B + +C + +J + +D + +E + +F + + +98% reused + + +appended.parquet +append 5 000 rows + +A + +B + +C + +D + +E + +F + +K + 
+L + + +98% reused + +STORAGE COMPARISON +Vanilla Parquet + +2,796 MB (4 independent files) +Page Store via CDC + +865 MB — 69% less · 3.2× ratio + \ No newline at end of file diff --git a/parquet/examples/page_store_dedup/pipeline.py b/parquet/examples/page_store_dedup/pipeline.py new file mode 100644 index 000000000000..a49eb0659139 --- /dev/null +++ b/parquet/examples/page_store_dedup/pipeline.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +""" +Full pipeline for the Parquet Page Store deduplication demo. + +Steps: + 1. Prepare – download dataset and produce 4 Parquet variant files. + 2. Build – compile the parquet-page-store CLI binary. + 3. Ingest – write all variants into a shared content-addressed page store. + 4. Stats – compute and display deduplication statistics. + +Usage: + python pipeline.py [--file PATH] [--skip-prepare] [--skip-build] [--skip-ingest] + +Options: + --file PATH Use a local Parquet file instead of downloading from HuggingFace + --skip-prepare Skip data preparation (variants must already exist in data/) + --skip-build Skip cargo build (binary must already exist) + --skip-ingest Skip page store ingest (pages must already exist in pages/) +""" + +import argparse +import os +import shutil +import subprocess +import sys + +# Ensure imports from the same directory work regardless of cwd +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +import pyarrow as pa +import pyarrow.compute as pc +import pyarrow.parquet as pq + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DATA_DIR = os.path.join(SCRIPT_DIR, "data") +META_DIR = os.path.join(SCRIPT_DIR, "meta") +PAGES_DIR = os.path.join(SCRIPT_DIR, "pages") +CACHE_DIR = os.path.join(SCRIPT_DIR, ".cache") + +# Repo root is 3 levels up: page_store_dedup/ -> examples/ -> parquet/ -> arrow-rs/ +REPO_ROOT = 
os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..")) +BINARY = os.path.join(REPO_ROOT, "target", "release", "parquet-page-store") + +HF_REPO_ID = "kszucs/pq" +HF_FILENAME = "hermes-2.5-cdc-short.parquet" + +# Number of rows to reserve for the appended variant +APPEND_ROWS = 5_000 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _fmt(n: int) -> str: + return f"{n:,}" + + +def _mb(path: str) -> str: + return f"{os.path.getsize(path) / 1e6:.1f} MB" + + +def _dir_size(directory: str, ext: str) -> int: + total = 0 + for entry in os.scandir(directory): + if entry.is_file() and entry.name.endswith(ext): + total += entry.stat().st_size + return total + + +# --------------------------------------------------------------------------- +# Step 1 – Prepare +# --------------------------------------------------------------------------- + + +def load_raw(path: str | None) -> tuple[pa.Table, pa.Table]: + if path is None: + try: + from huggingface_hub import hf_hub_download + except ImportError: + sys.exit("ERROR: huggingface_hub is required. Install with: pip install huggingface_hub") + + print(f" Downloading {HF_FILENAME} from HuggingFace ... 
", end="", flush=True) + path = hf_hub_download( + repo_id=HF_REPO_ID, + filename=HF_FILENAME, + repo_type="dataset", + cache_dir=CACHE_DIR, + ) + print(f"done ({_mb(path)})") + + print(f" Reading {os.path.basename(path)} ...") + full = pq.read_table(path) + print(f" Full table: {_fmt(len(full))} rows, {len(full.schema)} columns") + + if len(full) <= APPEND_ROWS: + sys.exit(f"ERROR: dataset has only {_fmt(len(full))} rows, need more than {_fmt(APPEND_ROWS)}") + + base = full.slice(0, len(full) - APPEND_ROWS) + extra = full.slice(len(full) - APPEND_ROWS, APPEND_ROWS) + return base, extra + + +def step_prepare(file_path: str | None) -> None: + print("=" * 60) + print(" Step 1 – Prepare dataset variants") + print("=" * 60) + + os.makedirs(DATA_DIR, exist_ok=True) + base, extra = load_raw(file_path) + print() + + total_bytes = 0 + print(f" Writing 4 variants to {DATA_DIR}/") + print() + + out = os.path.join(DATA_DIR, "original.parquet") + pq.write_table(base, out) + sz = os.path.getsize(out) + total_bytes += sz + print(f" original.parquet {_fmt(len(base)):>9} rows {sz / 1e6:>6.1f} MB (baseline)") + + out = os.path.join(DATA_DIR, "filtered.parquet") + mask = pc.less(pc.list_value_length(base["conversations"]), 3) + filtered = base.filter(mask) + pq.write_table(filtered, out) + sz = os.path.getsize(out) + total_bytes += sz + pct = len(filtered) * 100 // len(base) + print(f" filtered.parquet {_fmt(len(filtered)):>9} rows {sz / 1e6:>6.1f} MB ({pct}% of original rows kept)") + + out = os.path.join(DATA_DIR, "augmented.parquet") + num_turns = pc.list_value_length(base["conversations"]).cast(pa.int32()) + augmented = base.append_column(pa.field("num_turns", pa.int32()), num_turns) + pq.write_table(augmented, out) + sz = os.path.getsize(out) + total_bytes += sz + print(f" augmented.parquet {_fmt(len(augmented)):>9} rows {sz / 1e6:>6.1f} MB (same rows, +1 column)") + + out = os.path.join(DATA_DIR, "appended.parquet") + appended = pa.concat_tables([base, extra]) + 
pq.write_table(appended, out) + sz = os.path.getsize(out) + total_bytes += sz + print(f" appended.parquet {_fmt(len(appended)):>9} rows {sz / 1e6:>6.1f} MB (+{_fmt(APPEND_ROWS)} rows appended)") + + print() + print(f" Total (4 independent files): {total_bytes / 1e6:.1f} MB") + print() + + +# --------------------------------------------------------------------------- +# Step 2 – Build +# --------------------------------------------------------------------------- + + +def step_build() -> None: + print("=" * 60) + print(" Step 2 – Build parquet-page-store binary") + print("=" * 60) + print() + + cmd = ["cargo", "build", "--release", "-p", "parquet", "--features", "page_store,cli"] + print(f" Running: {' '.join(cmd)}") + print() + + result = subprocess.run(cmd, cwd=REPO_ROOT) + if result.returncode != 0: + sys.exit(f"ERROR: cargo build failed (exit code {result.returncode})") + + print() + print(f" Binary: {BINARY}") + print() + + +# --------------------------------------------------------------------------- +# Step 3 – Ingest into page store +# --------------------------------------------------------------------------- + + +def step_ingest() -> None: + print("=" * 60) + print(" Step 3 – Ingest Parquet files into page store") + print("=" * 60) + print() + + if not os.path.isfile(BINARY): + sys.exit(f"ERROR: binary not found at {BINARY}\n Run without --skip-build first.") + + for d in (PAGES_DIR, META_DIR): + if os.path.isdir(d): + shutil.rmtree(d) + os.makedirs(d) + + inputs = sorted( + os.path.join(DATA_DIR, f) + for f in os.listdir(DATA_DIR) + if f.endswith(".parquet") and os.path.isfile(os.path.join(DATA_DIR, f)) + ) + if not inputs: + sys.exit(f"ERROR: no .parquet files found in {DATA_DIR}") + + cmd = [BINARY, "write"] + inputs + ["--store", PAGES_DIR, "--output", META_DIR, "--compression", "snappy"] + print(f" Running: parquet-page-store write <{len(inputs)} files> --store pages --output meta --compression snappy") + print() + + result = subprocess.run(cmd, 
cwd=SCRIPT_DIR) + if result.returncode != 0: + sys.exit(f"ERROR: parquet-page-store write failed (exit code {result.returncode})") + + print() + + +# --------------------------------------------------------------------------- +# Step 4 – Statistics +# --------------------------------------------------------------------------- + + +def step_stats() -> tuple[float, float]: + print("=" * 60) + print(" Step 4 – Deduplication statistics") + print("=" * 60) + print() + + # Input file sizes (top-level .parquet files only) + input_files = sorted( + os.path.join(DATA_DIR, f) + for f in os.listdir(DATA_DIR) + if f.endswith(".parquet") and os.path.isfile(os.path.join(DATA_DIR, f)) + ) + if not input_files: + print(" No input files found — run without --skip-ingest first.") + return 0.0, 0.0 + + total_input = sum(os.path.getsize(p) for p in input_files) + + print(" Input files:") + for path in input_files: + sz = os.path.getsize(path) + print(f" {os.path.basename(path):<25} {sz / 1e6:>7.1f} MB") + print(f" {'Total':<25} {total_input / 1e6:>7.1f} MB") + print() + + # Page store size + if not os.path.isdir(PAGES_DIR): + print(" Page store directory not found — run without --skip-ingest first.") + return 0.0, 0.0 + + page_files = [e for e in os.scandir(PAGES_DIR) if e.is_file() and e.name.endswith(".page")] + if not page_files: + print(" No .page files found in pages/ — run without --skip-ingest first.") + return 0.0, 0.0 + + total_pages = _dir_size(PAGES_DIR, ".page") + page_count = len(page_files) + + ratio = total_pages / total_input + savings = 1.0 - ratio + bar_len = 20 + bar = "█" * round(ratio * bar_len) + + print(" Page store:") + print(f" Unique pages: {page_count:>7,}") + print(f" Page store size: {total_pages / 1e6:>7.1f} MB") + print(f" Total input size: {total_input / 1e6:>7.1f} MB") + print(f" Dedup ratio (store/input): {ratio * 100:>6.1f}% {bar}") + print(f" Space savings: {savings * 100:>6.1f}%") + print() + print(" Note: these numbers reflect page-level 
deduplication within the") + print(" page store. Block-level tools (e.g. 'de stats') operate at a") + print(" different granularity and will report lower dedup ratios.") + print() + + return total_input / 1e6, total_pages / 1e6 + + +# --------------------------------------------------------------------------- +# Step 5 – Regenerate concept diagram +# --------------------------------------------------------------------------- + + +def step_concept(total_mb: float, store_mb: float) -> None: + print("=" * 60) + print(" Step 5 – Regenerate concept diagram") + print("=" * 60) + print() + + try: + from concept import generate + except ImportError: + print(" SKIP: drawsvg not installed (pip install drawsvg)") + print() + return + + generate(total_mb=total_mb, store_mb=store_mb) + print() + + +# --------------------------------------------------------------------------- +# Step 6 – Roundtrip verification +# --------------------------------------------------------------------------- + + +def step_verify() -> None: + print("=" * 60) + print(" Step 6 – Roundtrip verification") + print("=" * 60) + print() + + verify_dir = os.path.join(SCRIPT_DIR, "verify") + if os.path.isdir(verify_dir): + shutil.rmtree(verify_dir) + os.makedirs(verify_dir) + + if not os.path.isfile(BINARY): + sys.exit(f"ERROR: binary not found at {BINARY}\n Run without --skip-build first.") + + meta_files = sorted( + os.path.join(META_DIR, f) + for f in os.listdir(META_DIR) + if f.endswith(".meta.parquet") and os.path.isfile(os.path.join(META_DIR, f)) + ) + if not meta_files: + sys.exit(f"ERROR: no .meta.parquet files found in {META_DIR}") + + all_ok = True + for meta_path in meta_files: + stem = os.path.basename(meta_path).replace(".meta.parquet", "") + original_path = os.path.join(DATA_DIR, f"{stem}.parquet") + reconstructed_path = os.path.join(verify_dir, f"{stem}.parquet") + + if not os.path.isfile(original_path): + print(f" SKIP {stem}: original not found in data/") + continue + + cmd = [BINARY, 
"reconstruct", meta_path, "--store", PAGES_DIR, "--output", reconstructed_path] + result = subprocess.run(cmd, capture_output=True) + if result.returncode != 0: + print(f" FAIL {stem}: reconstruction failed") + print(result.stderr.decode()) + all_ok = False + continue + + original = pq.read_table(original_path) + reconstructed = pq.read_table(reconstructed_path) + + if original.equals(reconstructed, check_metadata=False): + print(f" OK {stem}.parquet ({len(original):,} rows, {len(original.schema)} columns)") + else: + print(f" FAIL {stem}: data mismatch") + orig_rows, rec_rows = len(original), len(reconstructed) + if orig_rows != rec_rows: + print(f" row count: original={orig_rows:,} reconstructed={rec_rows:,}") + else: + for col in original.schema.names: + if col not in reconstructed.schema.names: + print(f" missing column in reconstructed: {col}") + elif not original[col].equals(reconstructed[col]): + print(f" column mismatch: {col}") + all_ok = False + + print() + if all_ok: + print(" All roundtrip checks passed.") + else: + sys.exit("ERROR: roundtrip verification failed") + print() + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--file", metavar="PATH", help="Use a local Parquet file instead of downloading") + parser.add_argument("--skip-prepare", action="store_true", help="Skip data preparation step") + parser.add_argument("--skip-build", action="store_true", help="Skip cargo build step") + parser.add_argument("--skip-ingest", action="store_true", help="Skip page store ingest step") + parser.add_argument("--skip-concept", action="store_true", help="Skip concept diagram regeneration") + parser.add_argument("--skip-verify", action="store_true", help="Skip roundtrip verification step") + 
args = parser.parse_args() + + if not args.skip_prepare: + step_prepare(args.file) + + if not args.skip_build: + step_build() + + if not args.skip_ingest: + step_ingest() + + total_mb, store_mb = step_stats() + + if not args.skip_concept: + step_concept(total_mb, store_mb) + + if not args.skip_verify: + step_verify() + + +if __name__ == "__main__": + main() diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs index e24788e4bcd7..11007930dfb0 100644 --- a/parquet/src/arrow/mod.rs +++ b/parquet/src/arrow/mod.rs @@ -182,10 +182,10 @@ experimental!(mod array_reader); pub mod arrow_reader; pub mod arrow_writer; -#[cfg(feature = "page_store")] -pub mod page_store; mod buffer; mod decoder; +#[cfg(feature = "page_store")] +pub mod page_store; #[cfg(feature = "async")] pub mod async_reader; diff --git a/parquet/src/arrow/page_store/mod.rs b/parquet/src/arrow/page_store/mod.rs index 87aa408b7840..aa710a102ff7 100644 --- a/parquet/src/arrow/page_store/mod.rs +++ b/parquet/src/arrow/page_store/mod.rs @@ -67,18 +67,16 @@ mod tests { use std::sync::Arc; use arrow_array::{ - ArrayRef, BooleanArray, Float64Array, Int32Array, ListArray, RecordBatch, - StringArray, StructArray, + ArrayRef, BooleanArray, Float64Array, Int32Array, ListArray, RecordBatch, StringArray, + StructArray, }; use arrow_schema::Field; use super::*; + use crate::arrow::ArrowSchemaConverter; use crate::errors::Result; - use crate::file::metadata::{ - FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataWriter, - }; + use crate::file::metadata::{FileMetaData, KeyValue, ParquetMetaData, ParquetMetaDataWriter}; use crate::file::properties::{EnabledStatistics, WriterProperties}; - use crate::arrow::ArrowSchemaConverter; use crate::schema::types::SchemaDescriptor; // ----------------------------------------------------------------------- @@ -109,13 +107,50 @@ mod tests { fn sample_batch() -> RecordBatch { RecordBatch::try_from_iter(vec![ - ("id", Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as 
ArrayRef), - ("value", Arc::new(Float64Array::from(vec![1.0, 2.5, 3.7, 4.2, 5.9])) as ArrayRef), - ("name", Arc::new(StringArray::from(vec!["alice", "bob", "charlie", "diana", "eve"])) as ArrayRef), + ( + "id", + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])) as ArrayRef, + ), + ( + "value", + Arc::new(Float64Array::from(vec![1.0, 2.5, 3.7, 4.2, 5.9])) as ArrayRef, + ), + ( + "name", + Arc::new(StringArray::from(vec![ + "alice", "bob", "charlie", "diana", "eve", + ])) as ArrayRef, + ), + ]) + .unwrap() + } + + /// A large batch that encodes to well over 256 KiB per column, guaranteeing + /// multiple CDC pages per column with default CDC parameters (min 256 KiB). + /// Uses 100 000 rows of varied (non-compressible) data. + fn large_batch(n: usize) -> RecordBatch { + let ids: Vec = (0..n as i32).collect(); + // Vary the float values so they resist run-length compression + let values: Vec = (0..n).map(|i| (i as f64 * 1.000_001_f64).sin()).collect(); + // 30-byte strings — varied enough to prevent dictionary/RLE collapsing + let names: Vec = (0..n) + .map(|i| format!("row_{:0>10}_pad_{:0>10}", i, i * 7 + 3)) + .collect(); + RecordBatch::try_from_iter(vec![ + ("id", Arc::new(Int32Array::from(ids)) as ArrayRef), + ("value", Arc::new(Float64Array::from(values)) as ArrayRef), + ("name", Arc::new(StringArray::from(names)) as ArrayRef), ]) .unwrap() } + /// Concatenate all batches into one for equality comparison. 
+ fn concat_batches(batches: &[RecordBatch]) -> RecordBatch { + use arrow_select::concat::concat_batches; + let schema = batches[0].schema(); + concat_batches(&schema, batches).unwrap() + } + // ----------------------------------------------------------------------- // Round-trip tests // ----------------------------------------------------------------------- @@ -145,20 +180,26 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let b1 = RecordBatch::try_from_iter(vec![ - ("x", Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), - ]).unwrap(); - let b2 = RecordBatch::try_from_iter(vec![ - ("x", Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef), - ]).unwrap(); - - let metadata = write_batches(&store, &meta, &[b1, b2], None).unwrap(); + let b1 = RecordBatch::try_from_iter(vec![( + "x", + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + )]) + .unwrap(); + let b2 = RecordBatch::try_from_iter(vec![( + "x", + Arc::new(Int32Array::from(vec![4, 5])) as ArrayRef, + )]) + .unwrap(); + + let metadata = write_batches(&store, &meta, &[b1.clone(), b2.clone()], None).unwrap(); assert_eq!(metadata.num_row_groups(), 1); assert_eq!(metadata.file_metadata().num_rows(), 5); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); - let total: usize = batches.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 5); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); + assert_eq!(concat_batches(&batches), concat_batches(&[b1, b2])); } #[test] @@ -179,10 +220,64 @@ mod tests { assert_eq!(metadata.num_row_groups(), 3); assert_eq!(metadata.file_metadata().num_rows(), 15); - let total: usize = PageStoreReader::try_new(&meta, &store) - .unwrap().read_batches().unwrap() - .iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 15); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); + let expected = 
concat_batches(&[batch.clone(), batch.clone(), batch]);
+        assert_eq!(concat_batches(&batches), expected);
+    }
+
+    #[test]
+    fn test_multipage_roundtrip() {
+        // 1 000 000 rows encodes to several MiB per column, well above the 256 KiB
+        // CDC minimum, so every column gets multiple pages. Assert that the
+        // reconstructed data is bit-for-bit identical to the input.
+        let tmp = tempfile::tempdir().unwrap();
+        let store = tmp.path().join("pages");
+        let meta = tmp.path().join("data.parquet");
+
+        let batch = large_batch(1_000_000);
+        write_batches(&store, &meta, &[batch.clone()], None).unwrap();
+
+        // Must have produced more than one page per column
+        assert!(count_page_files(&store) > batch.num_columns());
+
+        let batches = PageStoreReader::try_new(&meta, &store)
+            .unwrap()
+            .read_batches()
+            .unwrap();
+        assert_eq!(concat_batches(&batches), batch);
+    }
+
+    #[test]
+    fn test_multipage_multiple_row_groups_roundtrip() {
+        // Three row groups, each large enough for multiple pages per column. 
+ let tmp = tempfile::tempdir().unwrap(); + let store = tmp.path().join("pages"); + let meta = tmp.path().join("data.parquet"); + + let b1 = large_batch(500_000); + let b2 = large_batch(500_000); + let b3 = large_batch(500_000); + + let mut writer = PageStoreWriter::try_new(&store, b1.schema(), None).unwrap(); + writer.write(&b1).unwrap(); + writer.flush().unwrap(); + writer.write(&b2).unwrap(); + writer.flush().unwrap(); + writer.write(&b3).unwrap(); + let metadata = writer.finish(&meta).unwrap(); + + assert_eq!(metadata.num_row_groups(), 3); + assert_eq!(metadata.file_metadata().num_rows(), 1_500_000); + + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); + let expected = concat_batches(&[b1, b2, b3]); + assert_eq!(concat_batches(&batches), expected); } #[test] @@ -213,12 +308,34 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = RecordBatch::try_from_iter(vec![ - ("id", Arc::new(Int32Array::from(vec![Some(1), None, Some(3), None, Some(5)])) as ArrayRef), - ("label", Arc::new(StringArray::from(vec![Some("a"), Some("b"), None, None, Some("e")])) as ArrayRef), - ]).unwrap(); + ( + "id", + Arc::new(Int32Array::from(vec![ + Some(1), + None, + Some(3), + None, + Some(5), + ])) as ArrayRef, + ), + ( + "label", + Arc::new(StringArray::from(vec![ + Some("a"), + Some("b"), + None, + None, + Some("e"), + ])) as ArrayRef, + ), + ]) + .unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -228,12 +345,17 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let batch = RecordBatch::try_from_iter(vec![ - ("flag", Arc::new(BooleanArray::from(vec![true, false, true, true, false])) as ArrayRef), - ]).unwrap(); + let batch = 
RecordBatch::try_from_iter(vec![( + "flag", + Arc::new(BooleanArray::from(vec![true, false, true, true, false])) as ArrayRef, + )]) + .unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -244,17 +366,23 @@ mod tests { let meta = tmp.path().join("data.parquet"); let struct_array = StructArray::from(vec![ - (Arc::new(Field::new("a", arrow_schema::DataType::Int32, false)), - Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef), - (Arc::new(Field::new("b", arrow_schema::DataType::Utf8, false)), - Arc::new(StringArray::from(vec!["x", "y", "z"])) as ArrayRef), + ( + Arc::new(Field::new("a", arrow_schema::DataType::Int32, false)), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + Arc::new(Field::new("b", arrow_schema::DataType::Utf8, false)), + Arc::new(StringArray::from(vec!["x", "y", "z"])) as ArrayRef, + ), ]); - let batch = RecordBatch::try_from_iter(vec![ - ("s", Arc::new(struct_array) as ArrayRef), - ]).unwrap(); + let batch = + RecordBatch::try_from_iter(vec![("s", Arc::new(struct_array) as ArrayRef)]).unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -272,12 +400,14 @@ mod tests { Arc::new(values), None, ); - let batch = RecordBatch::try_from_iter(vec![ - ("items", Arc::new(list) as ArrayRef), - ]).unwrap(); + let batch = + RecordBatch::try_from_iter(vec![("items", Arc::new(list) as ArrayRef)]).unwrap(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let 
batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -295,8 +425,12 @@ mod tests { write_batches(&store, &meta, &[batch.clone()], None).unwrap(); let total: usize = PageStoreReader::try_new(&meta, &store) - .unwrap().read_batches().unwrap() - .iter().map(|b| b.num_rows()).sum(); + .unwrap() + .read_batches() + .unwrap() + .iter() + .map(|b| b.num_rows()) + .sum(); assert_eq!(total, 5); } @@ -312,7 +446,10 @@ mod tests { .build(); write_batches(&store, &meta, &[batch.clone()], Some(props)).unwrap(); - let batches = PageStoreReader::try_new(&meta, &store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -348,8 +485,9 @@ mod tests { let unique: std::collections::HashSet<_> = manifest.pages.iter().map(|p| &p.hash).collect(); assert_eq!(count_page_files(&store), unique.len()); - let total: usize = reader.read_batches().unwrap().iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 10); + let batches = reader.read_batches().unwrap(); + let expected = concat_batches(&[batch.clone(), batch]); + assert_eq!(concat_batches(&batches), expected); } #[test] @@ -369,8 +507,14 @@ mod tests { assert_eq!(pages_after_first, pages_after_second); - let batches_a = PageStoreReader::try_new(&meta_a, &store).unwrap().read_batches().unwrap(); - let batches_b = PageStoreReader::try_new(&meta_b, &store).unwrap().read_batches().unwrap(); + let batches_a = PageStoreReader::try_new(&meta_a, &store) + .unwrap() + .read_batches() + .unwrap(); + let batches_b = PageStoreReader::try_new(&meta_b, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches_a, batches_b); assert_eq!(batches_a[0], batch); } @@ -389,7 +533,10 @@ mod tests { let batch = sample_batch(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let batches = PageStoreReader::try_new(&meta, 
&store).unwrap().read_batches().unwrap(); + let batches = PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .unwrap(); assert_eq!(batches[0], batch); } @@ -428,11 +575,14 @@ mod tests { } assert!(manifest.pages.iter().all(|p| p.row_group == 0)); - let columns: std::collections::HashSet<_> = manifest.pages.iter().map(|p| p.column).collect(); + let columns: std::collections::HashSet<_> = + manifest.pages.iter().map(|p| p.column).collect(); assert_eq!(columns.len(), metadata.row_groups()[0].num_columns()); for col in &columns { - let mut idxs: Vec<_> = manifest.pages.iter() + let mut idxs: Vec<_> = manifest + .pages + .iter() .filter(|p| p.column == *col) .map(|p| p.page_index) .collect(); @@ -454,7 +604,10 @@ mod tests { let batch = sample_batch(); write_batches(&store, &meta, &[batch.clone()], None).unwrap(); - let schema = PageStoreReader::try_new(&meta, &store).unwrap().schema().unwrap(); + let schema = PageStoreReader::try_new(&meta, &store) + .unwrap() + .schema() + .unwrap(); assert_eq!(schema.fields(), batch.schema().fields()); } @@ -490,13 +643,19 @@ mod tests { write_batches(&store, &meta, &[sample_batch()], None).unwrap(); - let first_page = fs::read_dir(&store).unwrap() + let first_page = fs::read_dir(&store) + .unwrap() .filter_map(|e| e.ok()) .find(|e| e.path().extension().map_or(false, |ext| ext == "page")) .unwrap(); fs::remove_file(first_page.path()).unwrap(); - assert!(PageStoreReader::try_new(&meta, &store).unwrap().read_batches().is_err()); + assert!( + PageStoreReader::try_new(&meta, &store) + .unwrap() + .read_batches() + .is_err() + ); } #[test] @@ -505,18 +664,30 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema = ArrowSchemaConverter::new() + .convert(&sample_batch().schema()) + .unwrap(); let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); let 
file_metadata = FileMetaData::new( - 2, 0, None, - Some(vec![KeyValue::new(MANIFEST_KEY.to_string(), "not json{{{".to_string())]), - schema_descr, None, + 2, + 0, + None, + Some(vec![KeyValue::new( + MANIFEST_KEY.to_string(), + "not json{{{".to_string(), + )]), + schema_descr, + None, ); fs::create_dir_all(&store).unwrap(); let file = fs::File::create(&meta).unwrap(); - ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])) + .finish() + .unwrap(); - let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); + let err = PageStoreReader::try_new(&meta, &store) + .unwrap_err() + .to_string(); assert!(err.contains("expected"), "unexpected error: {err}"); } @@ -526,14 +697,23 @@ mod tests { let store = tmp.path().join("pages"); let meta = tmp.path().join("data.parquet"); - let schema = ArrowSchemaConverter::new().convert(&sample_batch().schema()).unwrap(); + let schema = ArrowSchemaConverter::new() + .convert(&sample_batch().schema()) + .unwrap(); let schema_descr = Arc::new(SchemaDescriptor::new(schema.root_schema_ptr())); let file_metadata = FileMetaData::new(2, 0, None, None, schema_descr, None); fs::create_dir_all(&store).unwrap(); let file = fs::File::create(&meta).unwrap(); - ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])).finish().unwrap(); + ParquetMetaDataWriter::new(file, &ParquetMetaData::new(file_metadata, vec![])) + .finish() + .unwrap(); - let err = PageStoreReader::try_new(&meta, &store).unwrap_err().to_string(); - assert!(err.contains(MANIFEST_KEY), "error should mention key: {err}"); + let err = PageStoreReader::try_new(&meta, &store) + .unwrap_err() + .to_string(); + assert!( + err.contains(MANIFEST_KEY), + "error should mention key: {err}" + ); } } diff --git a/parquet/src/arrow/page_store/reader.rs b/parquet/src/arrow/page_store/reader.rs index 9712bf0e2c03..f5e02a70e76a 100644 
--- a/parquet/src/arrow/page_store/reader.rs +++ b/parquet/src/arrow/page_store/reader.rs @@ -28,7 +28,7 @@ use bytes::Bytes; use arrow_array::RecordBatch; use arrow_schema::{ArrowError, SchemaRef}; -use super::{PageStoreManifest, MANIFEST_KEY}; +use super::{MANIFEST_KEY, PageStoreManifest}; use crate::arrow::arrow_reader::{ ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, }; @@ -74,10 +74,7 @@ impl PageStoreReader { /// /// * `metadata_path` — path to the metadata-only `.parquet` file. /// * `store_dir` — directory containing `{hash}.page` blobs. - pub fn try_new( - metadata_path: impl AsRef, - store_dir: impl Into, - ) -> Result { + pub fn try_new(metadata_path: impl AsRef, store_dir: impl Into) -> Result { let store_dir = store_dir.into(); let file = fs::File::open(metadata_path.as_ref())?; @@ -119,8 +116,7 @@ impl PageStoreReader { /// decoded on-demand and only one batch is held in memory at a time. pub fn reader(&self) -> Result { let chunk_reader = PageStoreChunkReader::new(self.store_dir.clone(), &self.manifest); - let options = - ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required); + let options = ArrowReaderOptions::new().with_page_index_policy(PageIndexPolicy::Required); let arrow_metadata = ArrowReaderMetadata::try_new(Arc::clone(&self.metadata), options)?; ParquetRecordBatchReaderBuilder::new_with_metadata(chunk_reader, arrow_metadata).build() } @@ -150,8 +146,7 @@ impl PageStoreReader { crate::errors::ParquetError::General(format!("'{MANIFEST_KEY}' has no value")) })?; - serde_json::from_str(value) - .map_err(|e| crate::errors::ParquetError::General(e.to_string())) + serde_json::from_str(value).map_err(|e| crate::errors::ParquetError::General(e.to_string())) } } diff --git a/parquet/src/arrow/page_store/writer.rs b/parquet/src/arrow/page_store/writer.rs index 0c5377d3741c..87d2b4914ef9 100644 --- a/parquet/src/arrow/page_store/writer.rs +++ b/parquet/src/arrow/page_store/writer.rs @@ -27,12 +27,12 @@ 
use bytes::Bytes; use arrow_array::RecordBatch; use arrow_schema::{DataType as ArrowDataType, SchemaRef}; -use super::{PageRef, PageStoreManifest, MANIFEST_KEY}; +use super::{MANIFEST_KEY, PageRef, PageStoreManifest}; +use crate::arrow::ArrowSchemaConverter; use crate::arrow::arrow_writer::{ ArrowColumnChunk, ArrowColumnChunkData, ArrowColumnWriterImpl, ArrowRowGroupWriter, SharedColumnChunk, }; -use crate::arrow::ArrowSchemaConverter; use crate::column::chunker::ContentDefinedChunker; use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter}; use crate::column::writer::{GenericColumnWriter, get_column_writer}; @@ -43,7 +43,9 @@ use crate::file::metadata::{ }; use crate::file::page_index::column_index::ColumnIndexMetaData; use crate::file::page_index::offset_index::OffsetIndexMetaData; -use crate::file::properties::{CdcOptions, EnabledStatistics, WriterProperties, WriterPropertiesPtr}; +use crate::file::properties::{ + CdcOptions, EnabledStatistics, WriterProperties, WriterPropertiesPtr, +}; use crate::parquet_thrift::{ThriftCompactOutputProtocol, WriteThrift}; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, SchemaDescriptor}; @@ -186,11 +188,7 @@ fn make_column_writer( let chunk: SharedColumnChunk = pw.buffer.clone(); let writer = if use_byte_array { - ArrowColumnWriterImpl::ByteArray(GenericColumnWriter::new( - desc.clone(), - props.clone(), - pw, - )) + ArrowColumnWriterImpl::ByteArray(GenericColumnWriter::new(desc.clone(), props.clone(), pw)) } else { ArrowColumnWriterImpl::Column(get_column_writer(desc.clone(), props.clone(), pw)) }; @@ -238,17 +236,53 @@ fn create_writers_for_type( | ArrowDataType::FixedSizeList(f, _) | ArrowDataType::ListView(f) | ArrowDataType::LargeListView(f) => { - create_writers_for_type(f.data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; + create_writers_for_type( + f.data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; } 
ArrowDataType::Struct(fields) => { for field in fields { - create_writers_for_type(field.data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; + create_writers_for_type( + field.data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; } } ArrowDataType::Map(f, _) => match f.data_type() { ArrowDataType::Struct(f) => { - create_writers_for_type(f[0].data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; - create_writers_for_type(f[1].data_type(), props, leaves, store_dir, page_refs, row_group, col_idx, out)?; + create_writers_for_type( + f[0].data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; + create_writers_for_type( + f[1].data_type(), + props, + leaves, + store_dir, + page_refs, + row_group, + col_idx, + out, + )?; } _ => unreachable!("invalid map type"), }, diff --git a/parquet/src/bin/parquet-page-store.rs b/parquet/src/bin/parquet-page-store.rs index 47f9e4be1f75..6b8d0feb5762 100644 --- a/parquet/src/bin/parquet-page-store.rs +++ b/parquet/src/bin/parquet-page-store.rs @@ -41,9 +41,10 @@ use std::path::PathBuf; use arrow_array::RecordBatchReader; use clap::{Parser, Subcommand, ValueEnum}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::arrow::arrow_writer::ArrowWriter; use parquet::arrow::page_store::{PageStoreReader, PageStoreWriter}; use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; -use parquet::errors::Result; +use parquet::errors::{ParquetError, Result}; use parquet::file::properties::WriterProperties; #[derive(Debug, Parser)] @@ -56,13 +57,16 @@ use parquet::file::properties::WriterProperties; /// /// The workflow has two steps: /// -/// 1. `write` — reads regular Parquet files, re-encodes their pages with CDC +/// 1. 
`write` — reads regular Parquet files, re-encodes their pages with CDC /// chunking, writes each page as a {hash}.page blob into a shared store /// directory, and produces a lightweight metadata-only Parquet file. /// -/// 2. `read` — given a metadata Parquet file and the store directory, +/// 2. `read` — given a metadata Parquet file and the store directory, /// reassembles the data and prints it. /// +/// 3. `reconstruct` — given a metadata Parquet file and the store directory, +/// writes a self-contained regular Parquet file (no page store dependency). +/// /// Quick start: /// /// # Write a file into the store @@ -71,6 +75,9 @@ use parquet::file::properties::WriterProperties; /// # Read it back /// parquet-page-store read ./meta/data.meta.parquet --store ./pages /// +/// # Reconstruct a self-contained Parquet file from the page store +/// parquet-page-store reconstruct ./meta/data.meta.parquet --store ./pages --output data.parquet +/// /// # Write several files (pages are deduplicated across them) /// parquet-page-store write a.parquet b.parquet --store ./pages struct Cli { @@ -137,6 +144,32 @@ enum Command { #[clap(short, long)] store: PathBuf, }, + + /// Reconstruct a self-contained Parquet file from a page-store-backed one. + /// + /// Reads all data from the page store via the metadata file and writes a + /// regular Parquet file that has no dependency on the store directory. + /// Useful for exporting, verification, or migrating data out of the store. + /// + /// Example: + /// + /// parquet-page-store reconstruct data.meta.parquet --store ./pages --output data.parquet + Reconstruct { + /// Path to the metadata-only Parquet file. + input: PathBuf, + + /// Page store directory containing the .page blobs. + #[clap(short, long)] + store: PathBuf, + + /// Output path for the reconstructed regular Parquet file. + #[clap(short, long)] + output: PathBuf, + + /// Compression codec for the output file [default: snappy]. 
+ #[clap(long, default_value = "snappy")] + compression: CompressionArg, + }, } #[derive(Debug, Clone, ValueEnum)] @@ -176,6 +209,12 @@ fn main() { compression, } => cmd_write(&inputs, &store, output.as_deref(), compression), Command::Read { input, store } => cmd_read(&input, &store), + Command::Reconstruct { + input, + store, + output, + compression, + } => cmd_reconstruct(&input, &store, &output, compression), }; if let Err(e) = result { eprintln!("Error: {e}"); @@ -183,6 +222,35 @@ fn main() { } } +/// Expand any glob patterns in `inputs` into concrete file paths. +/// +/// Patterns containing `*` or `?` are expanded using the `glob` crate. +/// Literal paths (no wildcards) are passed through unchanged. +/// This lets you write `parquet-page-store write "data/*.parquet"` on any +/// platform without relying on shell glob expansion. +fn expand_inputs(inputs: &[PathBuf]) -> Result> { + let mut expanded = Vec::new(); + for input in inputs { + let s = input.to_string_lossy(); + if s.contains('*') || s.contains('?') { + let mut matches: Vec = glob::glob(&s) + .map_err(|e| ParquetError::General(format!("invalid glob pattern: {e}")))? + .map(|entry| entry.map_err(|e| ParquetError::General(format!("glob error: {e}")))) + .collect::>()?; + if matches.is_empty() { + return Err(ParquetError::General(format!( + "glob pattern matched no files: {s}" + ))); + } + matches.sort(); + expanded.extend(matches); + } else { + expanded.push(input.clone()); + } + } + Ok(expanded) +} + fn cmd_write( inputs: &[PathBuf], store: &PathBuf, @@ -192,7 +260,9 @@ fn cmd_write( let output_dir = output_dir.unwrap_or_else(|| std::path::Path::new(".")); std::fs::create_dir_all(output_dir)?; - for input in inputs { + let inputs = expand_inputs(inputs)?; + + for input in &inputs { let file = File::open(input)?; let reader = ParquetRecordBatchReaderBuilder::try_new(file)? 
.with_batch_size(8192) @@ -211,8 +281,7 @@ fn cmd_write( let mut writer = PageStoreWriter::try_new(store, schema, Some(props))?; let mut total_rows = 0usize; for batch in reader { - let batch = - batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; + let batch = batch.map_err(|e| parquet::errors::ParquetError::General(e.to_string()))?; total_rows += batch.num_rows(); writer.write(&batch)?; } @@ -256,6 +325,42 @@ fn cmd_write( Ok(()) } +fn cmd_reconstruct( + input: &PathBuf, + store: &PathBuf, + output: &PathBuf, + compression: CompressionArg, +) -> Result<()> { + let reader = PageStoreReader::try_new(input, store)?; + let schema = reader + .schema() + .map_err(|e| ParquetError::General(e.to_string()))?; + + let props = WriterProperties::builder() + .set_compression(compression.to_parquet()) + .build(); + let file = File::create(output)?; + let mut writer = ArrowWriter::try_new(file, schema, Some(props))?; + + let mut total_rows = 0usize; + for batch in reader.reader()? 
{ + let batch = batch.map_err(|e| ParquetError::General(e.to_string()))?; + total_rows += batch.num_rows(); + writer.write(&batch)?; + } + let metadata = writer.close()?; + + eprintln!( + "{}: {} row(s), {} row group(s) -> {}", + input.display(), + total_rows, + metadata.num_row_groups(), + output.display(), + ); + + Ok(()) +} + fn cmd_read(input: &PathBuf, store: &PathBuf) -> Result<()> { let reader = PageStoreReader::try_new(input, store)?; let md = reader.metadata(); From 2e9ee369f8898bd2448c86bf598f44708a2a3a3d Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 19:27:26 +0200 Subject: [PATCH 5/8] chore: add ASF license headers and fix RAT/Prettier CI failures --- dev/release/rat_exclude_files.txt | 1 + parquet/examples/page_store_dedup/README.md | 90 +++++++++++-------- parquet/examples/page_store_dedup/concept.py | 16 ++++ parquet/examples/page_store_dedup/pipeline.py | 16 ++++ 4 files changed, 88 insertions(+), 35 deletions(-) diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index d08a0ea8c74a..3b2dd0051a1a 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -20,3 +20,4 @@ arrow-flight/src/sql/arrow.flight.protocol.sql.rs .github/* parquet/src/bin/parquet-fromcsv-help.txt arrow-flight/examples/data/* +parquet/examples/page_store_dedup/page_store_concept.svg diff --git a/parquet/examples/page_store_dedup/README.md b/parquet/examples/page_store_dedup/README.md index 63ffa70f2ceb..41d34f7b99c7 100644 --- a/parquet/examples/page_store_dedup/README.md +++ b/parquet/examples/page_store_dedup/README.md @@ -1,29 +1,48 @@ + + # Parquet Page Store — Deduplication Demo > **Prototype**: This is an experimental feature exploring content-defined -> chunking for Parquet. APIs and file formats may change. +> chunking for Parquet. APIs and file formats may change. 
Demonstrates how Content-Defined Chunking (CDC) enables efficient deduplication across multiple versions of a dataset using the Parquet page store writer in -Apache Arrow Rust. The deduplication is self-contained in the Parquet writer — +Apache Arrow Rust. The deduplication is self-contained in the Parquet writer — no special storage system is required. ## What this demo shows Four common dataset operations are applied to a real-world dataset ([OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5) -conversational data, ~800 MB per file). Each operation produces a separate -Parquet file. Without a page store, storing all four files costs the full sum -of their sizes. With the CDC page store, identical pages are stored **exactly +conversational data, ~800 MB per file). Each operation produces a separate +Parquet file. Without a page store, storing all four files costs the full sum +of their sizes. With the CDC page store, identical pages are stored **exactly once** — indexed by their BLAKE3 hash — so the four files share most of their -bytes. The resulting files can be stored anywhere. +bytes. The resulting files can be stored anywhere. 
-| File | Operation | -|------|-----------| -| `original.parquet` | Baseline dataset (~996k rows) | -| `filtered.parquet` | Keep rows where `num_turns ≤ 3` | +| File | Operation | +| ------------------- | -------------------------------------- | +| `original.parquet` | Baseline dataset (~996k rows) | +| `filtered.parquet` | Keep rows where `num_turns ≤ 3` | | `augmented.parquet` | Original + computed column `num_turns` | -| `appended.parquet` | Original + 5 000 new rows appended | +| `appended.parquet` | Original + 5 000 new rows appended | ## Prerequisites @@ -52,6 +71,7 @@ python pipeline.py --skip-prepare --skip-build --skip-ingest # stats only ``` Outputs: + - `page_store_concept.png` — architectural overview of how shared pages work - `page_store_savings.png` — side-by-side storage comparison with real numbers @@ -62,7 +82,7 @@ python pipeline.py --file /path/to/your.parquet ``` The script requires a `conversations` list column for the filtered and augmented -variants. Adapt `pipeline.py` to your own schema as needed. +variants. Adapt `pipeline.py` to your own schema as needed. 
## Results @@ -70,34 +90,34 @@ Dataset: **OpenHermes-2.5** (short conversations, `num_turns < 10`) ### Dataset variants -| File | Operation | Rows | Size | -|------|-----------|------|------| -| `original.parquet` | Baseline | 996,009 | 782.1 MB | -| `filtered.parquet` | Keep `num_turns ≤ 3` (removes 0.2% of rows) | 993,862 | 776.8 MB | -| `augmented.parquet` | Add column `num_turns` | 996,009 | 782.2 MB | -| `appended.parquet` | Append 5,000 rows | 1,001,009 | 788.6 MB | -| **Total** | | | **3,129.7 MB** | +| File | Operation | Rows | Size | +| ------------------- | ------------------------------------------- | --------- | -------------- | +| `original.parquet` | Baseline | 996,009 | 782.1 MB | +| `filtered.parquet` | Keep `num_turns ≤ 3` (removes 0.2% of rows) | 993,862 | 776.8 MB | +| `augmented.parquet` | Add column `num_turns` | 996,009 | 782.2 MB | +| `appended.parquet` | Append 5,000 rows | 1,001,009 | 788.6 MB | +| **Total** | | | **3,129.7 MB** | ### Page store results -| Metric | Value | -|--------|-------| -| Unique pages stored | 3,400 | -| Total page references | 15,179 | -| Page store size | 559.0 MB | -| Metadata files size | 4.4 MB | -| **Page store + metadata** | **563.4 MB** | -| **Storage saved** | **2,566.3 MB (82%)** | -| **Deduplication ratio** | **5.6×** | +| Metric | Value | +| ------------------------- | -------------------- | +| Unique pages stored | 3,400 | +| Total page references | 15,179 | +| Page store size | 559.0 MB | +| Metadata files size | 4.4 MB | +| **Page store + metadata** | **563.4 MB** | +| **Storage saved** | **2,566.3 MB (82%)** | +| **Deduplication ratio** | **5.6×** | ### Per-file page breakdown -| File | Page refs | Unique hashes | New pages | Reused pages | -|------|-----------|---------------|-----------|--------------| -| `original.parquet` | 3,782 | 3,100 | 3,100 | 0 | -| `filtered.parquet` | 3,755 | 3,075 | 222 | 2,853 (92%) | -| `augmented.parquet` | 3,834 | 3,136 | 36 | 3,100 (98%) | -| `appended.parquet` | 
3,808 | 3,125 | 42 | 3,083 (98%) | +| File | Page refs | Unique hashes | New pages | Reused pages | +| ------------------- | --------- | ------------- | --------- | ------------ | +| `original.parquet` | 3,782 | 3,100 | 3,100 | 0 | +| `filtered.parquet` | 3,755 | 3,075 | 222 | 2,853 (92%) | +| `augmented.parquet` | 3,834 | 3,136 | 36 | 3,100 (98%) | +| `appended.parquet` | 3,808 | 3,125 | 42 | 3,083 (98%) | ### Key insights @@ -110,7 +130,7 @@ Dataset: **OpenHermes-2.5** (short conversations, `num_turns < 10`) 3. **Filtering rows** (`filtered`): 92% of pages reused despite row removal. Removing just 0.2% of rows barely shifts CDC boundaries — most pages are - unchanged. Heavier filtering (removing 20–50% of rows) would produce more new + unchanged. Heavier filtering (removing 20–50% of rows) would produce more new pages, as CDC boundaries shift further throughout the file. 4. **Net result**: 4 dataset versions stored for **563 MB instead of 3.1 GB** — an diff --git a/parquet/examples/page_store_dedup/concept.py b/parquet/examples/page_store_dedup/concept.py index cdd30789145e..01523f664465 100644 --- a/parquet/examples/page_store_dedup/concept.py +++ b/parquet/examples/page_store_dedup/concept.py @@ -1,4 +1,20 @@ #!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. """ Generate the Parquet Page Store concept diagram. diff --git a/parquet/examples/page_store_dedup/pipeline.py b/parquet/examples/page_store_dedup/pipeline.py index a49eb0659139..6e4db2b6a2be 100644 --- a/parquet/examples/page_store_dedup/pipeline.py +++ b/parquet/examples/page_store_dedup/pipeline.py @@ -1,4 +1,20 @@ #!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. """ Full pipeline for the Parquet Page Store deduplication demo. 
From 735970f1cdf3937ec272100806560fd06fc1b7ba Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 19:57:05 +0200 Subject: [PATCH 6/8] fix(parquet): fix clippy warnings in page_store and cdc --- parquet/src/arrow/page_store/mod.rs | 30 +++++++++++++------------- parquet/src/arrow/page_store/writer.rs | 5 ++--- parquet/src/column/chunker/cdc.rs | 2 +- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/parquet/src/arrow/page_store/mod.rs b/parquet/src/arrow/page_store/mod.rs index aa710a102ff7..4aae932bbe35 100644 --- a/parquet/src/arrow/page_store/mod.rs +++ b/parquet/src/arrow/page_store/mod.rs @@ -87,7 +87,7 @@ mod tests { fs::read_dir(dir) .unwrap() .filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "page")) .count() } @@ -162,7 +162,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = sample_batch(); - let metadata = write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + let metadata = write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); assert_eq!(metadata.num_row_groups(), 1); assert_eq!(metadata.file_metadata().num_rows(), 5); @@ -238,7 +238,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = large_batch(1_000_000); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); // Must have produced more than one page per column assert!(count_page_files(&store) > batch.num_columns()); @@ -331,7 +331,7 @@ mod tests { ]) .unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -351,7 +351,7 @@ mod tests { )]) .unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + 
write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -378,7 +378,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("s", Arc::new(struct_array) as ArrayRef)]).unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -403,7 +403,7 @@ mod tests { let batch = RecordBatch::try_from_iter(vec![("items", Arc::new(list) as ArrayRef)]).unwrap(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() .read_batches() @@ -422,7 +422,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = sample_batch(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let total: usize = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -444,7 +444,7 @@ mod tests { let props = WriterProperties::builder() .set_statistics_enabled(EnabledStatistics::Page) .build(); - write_batches(&store, &meta, &[batch.clone()], Some(props)).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), Some(props)).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -499,10 +499,10 @@ mod tests { let batch = sample_batch(); - write_batches(&store, &meta_a, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta_a, std::slice::from_ref(&batch), None).unwrap(); let pages_after_first = count_page_files(&store); - write_batches(&store, &meta_b, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta_b, std::slice::from_ref(&batch), None).unwrap(); let pages_after_second = count_page_files(&store); 
assert_eq!(pages_after_first, pages_after_second); @@ -531,7 +531,7 @@ mod tests { fs::create_dir_all(meta.parent().unwrap()).unwrap(); let batch = sample_batch(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let batches = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -551,7 +551,7 @@ mod tests { for entry in fs::read_dir(&store).unwrap() { let entry = entry.unwrap(); let path = entry.path(); - if path.extension().map_or(false, |ext| ext == "page") { + if path.extension().is_some_and(|ext| ext == "page") { let data = fs::read(&path).unwrap(); let hash = blake3::hash(&data); let expected = format!("{}.page", hash.to_hex()); @@ -602,7 +602,7 @@ mod tests { let meta = tmp.path().join("data.parquet"); let batch = sample_batch(); - write_batches(&store, &meta, &[batch.clone()], None).unwrap(); + write_batches(&store, &meta, std::slice::from_ref(&batch), None).unwrap(); let schema = PageStoreReader::try_new(&meta, &store) .unwrap() @@ -646,7 +646,7 @@ mod tests { let first_page = fs::read_dir(&store) .unwrap() .filter_map(|e| e.ok()) - .find(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .find(|e| e.path().extension().is_some_and(|ext| ext == "page")) .unwrap(); fs::remove_file(first_page.path()).unwrap(); diff --git a/parquet/src/arrow/page_store/writer.rs b/parquet/src/arrow/page_store/writer.rs index 87d2b4914ef9..9b4cc123ceb7 100644 --- a/parquet/src/arrow/page_store/writer.rs +++ b/parquet/src/arrow/page_store/writer.rs @@ -196,6 +196,7 @@ fn make_column_writer( Ok(crate::arrow::arrow_writer::ArrowColumnWriter { chunk, writer }) } +#[allow(clippy::too_many_arguments)] fn create_writers_for_type( data_type: &ArrowDataType, props: &WriterPropertiesPtr, @@ -446,9 +447,8 @@ impl PageStoreWriter { let mut total_byte_size = 0i64; let mut cumulative_offset: i64 = self.next_page_offset; - let mut col_idx = 0usize; - for chunk in chunks { + for (col_idx, 
chunk) in chunks.into_iter().enumerate() { let mut close = chunk.close; total_byte_size += close.metadata.uncompressed_size(); @@ -479,7 +479,6 @@ impl PageStoreWriter { } } - col_idx += 1; cumulative_offset += close.metadata.compressed_size(); column_metadata.push(close.metadata); diff --git a/parquet/src/column/chunker/cdc.rs b/parquet/src/column/chunker/cdc.rs index bd03af2b471d..274a4a231018 100644 --- a/parquet/src/column/chunker/cdc.rs +++ b/parquet/src/column/chunker/cdc.rs @@ -2308,7 +2308,7 @@ mod arrow_tests { None, true, ); - let read = concat_batches(&read_batches(&buf)); + let read = concat_batches(read_batches(&buf)); let read_list = read.column(0).as_list::(); assert_eq!(read_list.len(), 5); assert!(read_list.is_valid(0)); From de45d6dfadfe3fd91863bd6b0d0178e23b8ae549 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 20:11:53 +0200 Subject: [PATCH 7/8] fix(parquet): fix clippy map_or and rustdoc unresolved link in page_store --- parquet/src/arrow/page_store/reader.rs | 2 +- parquet/src/bin/parquet-page-store.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/page_store/reader.rs b/parquet/src/arrow/page_store/reader.rs index f5e02a70e76a..c1637f8f7df4 100644 --- a/parquet/src/arrow/page_store/reader.rs +++ b/parquet/src/arrow/page_store/reader.rs @@ -110,7 +110,7 @@ impl PageStoreReader { )?)) } - /// Build a streaming [`ParquetRecordBatchReader`] over the page store. + /// Build a streaming `ParquetRecordBatchReader` over the page store. /// /// Prefer this over [`Self::read_batches`] for large files — batches are /// decoded on-demand and only one batch is held in memory at a time. diff --git a/parquet/src/bin/parquet-page-store.rs b/parquet/src/bin/parquet-page-store.rs index 6b8d0feb5762..496ec944de1d 100644 --- a/parquet/src/bin/parquet-page-store.rs +++ b/parquet/src/bin/parquet-page-store.rs @@ -314,7 +314,7 @@ fn cmd_write( let page_files = std::fs::read_dir(store)? 
.filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "page")) .count(); eprintln!( "Page store: {} page file(s) in {}", From 96a4f2eb19057737b5151d2fd1115befb0897b33 Mon Sep 17 00:00:00 2001 From: Krisztian Szucs Date: Fri, 3 Apr 2026 20:27:57 +0200 Subject: [PATCH 8/8] fix(parquet): fix clippy map_or in page_store example --- parquet/examples/page_store.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/examples/page_store.rs b/parquet/examples/page_store.rs index 736ce9354694..9b668546a6bc 100644 --- a/parquet/examples/page_store.rs +++ b/parquet/examples/page_store.rs @@ -77,7 +77,7 @@ fn main() -> parquet::errors::Result<()> { let page_files: Vec<_> = std::fs::read_dir(&store_dir) .unwrap() .filter_map(|e| e.ok()) - .filter(|e| e.path().extension().map_or(false, |ext| ext == "page")) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "page")) .collect(); println!("Page files in store: {}", page_files.len());