diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index a626076ebdd7..6a3f76b38867 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -508,6 +508,59 @@ impl ArrowReaderOptions { /// let mut reader = builder.build().unwrap(); /// let _batch = reader.next().unwrap().unwrap(); /// ``` + /// + /// # Example: Preserving Dictionary Encoding + /// + /// By default, Parquet string columns are read as `StringArray` (or `LargeStringArray`), + /// even if the underlying Parquet data uses dictionary encoding. You can preserve + /// the dictionary encoding by specifying a `Dictionary` type in the schema hint: + /// + /// ``` + /// use std::sync::Arc; + /// use tempfile::tempfile; + /// use arrow_array::{ArrayRef, RecordBatch, StringArray}; + /// use arrow_schema::{DataType, Field, Schema}; + /// use parquet::arrow::arrow_reader::{ArrowReaderOptions, ParquetRecordBatchReaderBuilder}; + /// use parquet::arrow::ArrowWriter; + /// + /// // Write a Parquet file with string data + /// let file = tempfile().unwrap(); + /// let schema = Arc::new(Schema::new(vec![ + /// Field::new("city", DataType::Utf8, false) + /// ])); + /// let cities = StringArray::from(vec!["Berlin", "Berlin", "Paris", "Berlin", "Paris"]); + /// let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(cities)]).unwrap(); + /// + /// let mut writer = ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), None).unwrap(); + /// writer.write(&batch).unwrap(); + /// writer.close().unwrap(); + /// + /// // Read the file back, requesting dictionary encoding preservation + /// let dict_schema = Arc::new(Schema::new(vec![ + /// Field::new("city", DataType::Dictionary( + /// Box::new(DataType::Int32), + /// Box::new(DataType::Utf8) + /// ), false) + /// ])); + /// let options = ArrowReaderOptions::new().with_schema(dict_schema); + /// let builder = ParquetRecordBatchReaderBuilder::try_new_with_options( + /// 
file.try_clone().unwrap(), + /// options + /// ).unwrap(); + /// + /// let mut reader = builder.build().unwrap(); + /// let batch = reader.next().unwrap().unwrap(); + /// + /// // The column is now a DictionaryArray + /// assert!(matches!( + /// batch.column(0).data_type(), + /// DataType::Dictionary(_, _) + /// )); + /// ``` + /// + /// **Note**: Dictionary encoding preservation works best when: + /// 1. The original column was dictionary encoded (the default for string columns) + /// 2. There are a small number of distinct values pub fn with_schema(self, schema: SchemaRef) -> Self { Self { supplied_schema: Some(schema),