From 0d712ced1a0640b3183cc886157aeb1796573a30 Mon Sep 17 00:00:00 2001 From: AndreaBozzo Date: Thu, 8 Jan 2026 12:42:08 +0100 Subject: [PATCH 1/2] docs(parquet): add example for preserving dictionary encoding --- parquet/src/arrow/arrow_reader/mod.rs | 60 +++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a626076ebdd7..ef6969f2463b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -508,6 +508,66 @@ impl ArrowReaderOptions { /// let mut reader = builder.build().unwrap(); /// let _batch = reader.next().unwrap().unwrap(); /// ``` + /// + /// # Example: Preserving Dictionary Encoding + /// + /// By default, Parquet string columns are read as `StringArray` (or `LargeStringArray`), + /// even if the underlying Parquet data uses dictionary encoding. You can preserve + /// the dictionary encoding by specifying a `Dictionary` type in the schema hint: + /// + /// ``` + /// use std::sync::Arc; + /// use tempfile::tempfile; + /// use arrow_array::{ArrayRef, RecordBatch, StringArray}; + /// use arrow_schema::{DataType, Field, Schema}; + /// use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; + /// use parquet::arrow::ArrowWriter; + /// + /// // Write a Parquet file with string data + /// let file = tempfile().unwrap(); + /// let schema = Arc::new(Schema::new(vec![ + /// Field::new("city", DataType::Utf8, false) + /// ])); + /// let cities = StringArray::from(vec!["Berlin", "Berlin", "Paris", "Berlin", "Paris"]); + /// let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(cities)]).unwrap(); + /// + /// let mut writer = ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), None).unwrap(); + /// writer.write(&batch).unwrap(); + /// writer.close().unwrap(); + /// + /// // Read the file back, requesting dictionary encoding preservation + /// let dict_schema = 
Arc::new(Schema::new(vec![ + /// Field::new("city", DataType::Dictionary( + /// Box::new(DataType::Int32), + /// Box::new(DataType::Utf8) + /// ), false) + /// ])); + /// let options = ArrowReaderOptions::new().with_schema(dict_schema); + /// let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + /// file.try_clone().unwrap(), + /// options + /// ).unwrap(); + /// + /// // Verify the schema shows Dictionary type + /// assert!(matches!( + /// builder.schema().field(0).data_type(), + /// DataType::Dictionary(_, _) + /// )); + /// + /// let mut reader = builder.build().unwrap(); + /// let batch = reader.next().unwrap().unwrap(); + /// + /// // The column is now a DictionaryArray + /// assert!(matches!( + /// batch.column(0).data_type(), + /// DataType::Dictionary(_, _) + /// )); + /// ``` + /// + /// **Note**: Dictionary encoding preservation works best when the batch size + /// is a divisor of the row group size and a single read does not span multiple + /// column chunks. If these conditions are not met, the reader may compute + /// a fresh dictionary from the decoded values. 
pub fn with_schema(self, schema: SchemaRef) -> Self { Self { supplied_schema: Some(schema), From 1ac4d540f24aa86a8a4cda82a2d032d80324c607 Mon Sep 17 00:00:00 2001 From: AndreaBozzo Date: Fri, 9 Jan 2026 22:02:51 +0100 Subject: [PATCH 2/2] address review feedback - Remove redundant schema type check - Update note with accurate dictionary encoding guidance --- parquet/src/arrow/arrow_reader/mod.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index ef6969f2463b..6a3f76b38867 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -548,12 +548,6 @@ impl ArrowReaderOptions { /// options /// ).unwrap(); /// - /// // Verify the schema shows Dictionary type - /// assert!(matches!( - /// builder.schema().field(0).data_type(), - /// DataType::Dictionary(_, _) - /// )); - /// /// let mut reader = builder.build().unwrap(); /// let batch = reader.next().unwrap().unwrap(); /// @@ -564,10 +558,9 @@ impl ArrowReaderOptions { /// )); /// ``` /// - /// **Note**: Dictionary encoding preservation works best when the batch size - /// is a divisor of the row group size and a single read does not span multiple - /// column chunks. If these conditions are not met, the reader may compute - /// a fresh dictionary from the decoded values. + /// **Note**: Dictionary encoding preservation works best when: + /// 1. The original column was dictionary encoded (the default for string columns) + /// 2. There are a small number of distinct values pub fn with_schema(self, schema: SchemaRef) -> Self { Self { supplied_schema: Some(schema),