From 0d712ced1a0640b3183cc886157aeb1796573a30 Mon Sep 17 00:00:00 2001 From: AndreaBozzo Date: Thu, 8 Jan 2026 12:42:08 +0100 Subject: [PATCH 1/2] docs(parquet): add example for preserving dictionary encoding --- parquet/src/arrow/arrow_reader/mod.rs | 60 +++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a626076ebdd7..ef6969f2463b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -508,6 +508,66 @@ impl ArrowReaderOptions { /// let mut reader = builder.build().unwrap(); /// let _batch = reader.next().unwrap().unwrap(); /// ``` + /// + /// # Example: Preserving Dictionary Encoding + /// + /// By default, Parquet string columns are read as `StringArray` (or `LargeStringArray`), + /// even if the underlying Parquet data uses dictionary encoding. You can preserve + /// the dictionary encoding by specifying a `Dictionary` type in the schema hint: + /// + /// ``` + /// use std::sync::Arc; + /// use tempfile::tempfile; + /// use arrow_array::{ArrayRef, RecordBatch, StringArray}; + /// use arrow_schema::{DataType, Field, Schema}; + /// use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; + /// use parquet::arrow::ArrowWriter; + /// + /// // Write a Parquet file with string data + /// let file = tempfile().unwrap(); + /// let schema = Arc::new(Schema::new(vec![ + /// Field::new("city", DataType::Utf8, false) + /// ])); + /// let cities = StringArray::from(vec!["Berlin", "Berlin", "Paris", "Berlin", "Paris"]); + /// let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(cities)]).unwrap(); + /// + /// let mut writer = ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), None).unwrap(); + /// writer.write(&batch).unwrap(); + /// writer.close().unwrap(); + /// + /// // Read the file back, requesting dictionary encoding preservation + /// let dict_schema = 
Arc::new(Schema::new(vec![ + /// Field::new("city", DataType::Dictionary( + /// Box::new(DataType::Int32), + /// Box::new(DataType::Utf8) + /// ), false) + /// ])); + /// let options = ArrowReaderOptions::new().with_schema(dict_schema); + /// let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + /// file.try_clone().unwrap(), + /// options + /// ).unwrap(); + /// + /// // Verify the schema shows Dictionary type + /// assert!(matches!( + /// builder.schema().field(0).data_type(), + /// DataType::Dictionary(_, _) + /// )); + /// + /// let mut reader = builder.build().unwrap(); + /// let batch = reader.next().unwrap().unwrap(); + /// + /// // The column is now a DictionaryArray + /// assert!(matches!( + /// batch.column(0).data_type(), + /// DataType::Dictionary(_, _) + /// )); + /// ``` + /// + /// **Note**: Dictionary encoding preservation works best when the batch size + /// is a divisor of the row group size and a single read does not span multiple + /// column chunks. If these conditions are not met, the reader may compute + /// a fresh dictionary from the decoded values. 
pub fn with_schema(self, schema: SchemaRef) -> Self { Self { supplied_schema: Some(schema), From 1ac4d540f24aa86a8a4cda82a2d032d80324c607 Mon Sep 17 00:00:00 2001 From: AndreaBozzo Date: Fri, 9 Jan 2026 22:02:51 +0100 Subject: [PATCH 2/2] address review feedback - Remove redundant schema type check - Update note with accurate dictionary encoding guidance --- parquet/src/arrow/arrow_reader/mod.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index ef6969f2463b..6a3f76b38867 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -548,12 +548,6 @@ impl ArrowReaderOptions { /// options /// ).unwrap(); /// - /// // Verify the schema shows Dictionary type - /// assert!(matches!( - /// builder.schema().field(0).data_type(), - /// DataType::Dictionary(_, _) - /// )); - /// /// let mut reader = builder.build().unwrap(); /// let batch = reader.next().unwrap().unwrap(); /// @@ -564,10 +558,9 @@ impl ArrowReaderOptions { /// )); /// ``` /// - /// **Note**: Dictionary encoding preservation works best when the batch size - /// is a divisor of the row group size and a single read does not span multiple - /// column chunks. If these conditions are not met, the reader may compute - /// a fresh dictionary from the decoded values. + /// **Note**: Dictionary encoding preservation works best when: + /// 1. The original column was dictionary encoded (the default for string columns) + /// 2. There are a small number of distinct values pub fn with_schema(self, schema: SchemaRef) -> Self { Self { supplied_schema: Some(schema),