From 1bcea5173284315a072bf64ac5552741fc4bf336 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 19:09:26 +0300 Subject: [PATCH 01/22] fix(parquet): converting parquet schema with backward compatible repeated struct/primitive with provided arrow schema closes: - #8495 --- parquet/src/arrow/schema/complex.rs | 259 ++++++++++++++++++++++++++++ 1 file changed, 259 insertions(+) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index ecc80a65904a..ee8088d06809 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -624,3 +624,262 @@ pub fn convert_type(parquet_type: &TypePtr) -> Result { Ok(visitor.dispatch(parquet_type, context)?.unwrap()) } + +#[cfg(test)] +mod tests { + use crate::arrow::schema::complex::convert_schema; + use crate::arrow::ProjectionMask; + use crate::schema::parser::parse_message_type; + use crate::schema::types::SchemaDescriptor; + use arrow_schema::{DataType, Fields}; + use std::sync::Arc; + + #[test] + fn convert_schema_with_repeated_primitive_should_use_inferred_schema( + ) -> crate::errors::Result<()> { + let message_type = " + message schema { + repeated BYTE_ARRAY col_1 = 1; + } + "; + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + assert_eq!(schema_fields.len(), 1); + + let expected_schema = DataType::Struct(Fields::from(vec![Arc::new( + arrow_schema::Field::new( + "col_1", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_1", + DataType::Binary, + false, + ))), + false, + ) + .with_metadata(schema_fields[0].metadata().clone()), + )])); + + assert_eq!(converted.arrow_type, expected_schema); + + let utf8_instead_of_binary = Fields::from(vec![Arc::new( + arrow_schema::Field::new( + "col_1", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_1", + DataType::Utf8, + false, + ))), + false, + ) + .with_metadata(schema_fields[0].metadata().clone()), + )]); + + // Should be able to convert the same thing + let converted_again = convert_schema( + &schema, + ProjectionMask::all(), + Some(&utf8_instead_of_binary), + )? + .unwrap(); + + // Assert that we changed to Utf8 + assert_eq!( + converted_again.arrow_type, + DataType::Struct(utf8_instead_of_binary) + ); + + Ok(()) + } + + #[test] + fn convert_schema_with_repeated_struct_and_inferred_schema() -> crate::errors::Result<()> { + let message_type = " + message schema { + repeated group col_1 { + optional binary col_2; + optional binary col_3; + optional group col_4 { + optional int64 col_5; + optional int32 col_6; + } + } + } + "; + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + assert_eq!(schema_fields.len(), 1); + + // Should be able to convert the same thing + let converted_again = + convert_schema(&schema, ProjectionMask::all(), Some(&schema_fields))?.unwrap(); + + // Assert that we changed to Utf8 + assert_eq!(converted_again.arrow_type, converted.arrow_type); + + Ok(()) + } + + #[test] + fn convert_schema_with_nested_repeated_struct_and_primitives() -> crate::errors::Result<()> { + let message_type = " +message schema { + repeated group col_1 { + optional binary col_2; + repeated BYTE_ARRAY col_3; + repeated group col_4 { + optional int64 col_5; + repeated binary col_6; + } + } +} +"; + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + assert_eq!(schema_fields.len(), 1); + + // Build expected schema + let expected_schema = DataType::Struct(Fields::from(vec![Arc::new( + arrow_schema::Field::new( + "col_1", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_1", + DataType::Struct(Fields::from(vec![ + Arc::new(arrow_schema::Field::new("col_2", DataType::Binary, true)), + Arc::new(arrow_schema::Field::new( + "col_3", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_3", + DataType::Binary, + false, + ))), + false, + )), + Arc::new(arrow_schema::Field::new( + "col_4", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_4", + DataType::Struct(Fields::from(vec![ + Arc::new(arrow_schema::Field::new( + "col_5", + DataType::Int64, + true, + )), + Arc::new(arrow_schema::Field::new( + "col_6", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_6", + DataType::Binary, + false, + ))), + false, + )), + ])), + false, + ))), + false, + )), + ])), + false, + ))), + false, + ) + .with_metadata(schema_fields[0].metadata().clone()), + )])); + + assert_eq!(converted.arrow_type, expected_schema); + + // Test conversion with inferred schema + let converted_again = + convert_schema(&schema, ProjectionMask::all(), Some(&schema_fields))?.unwrap(); + + assert_eq!(converted_again.arrow_type, converted.arrow_type); + + // Test conversion with modified schema (change col_6 from Binary to Utf8) + let modified_schema_fields = Fields::from(vec![Arc::new( + arrow_schema::Field::new( + "col_1", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_1", + DataType::Struct(Fields::from(vec![ + Arc::new(arrow_schema::Field::new("col_2", DataType::Binary, true)), + Arc::new(arrow_schema::Field::new( + "col_3", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_3", + DataType::Binary, + false, + ))), + false, + )), + Arc::new(arrow_schema::Field::new( + "col_4", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_4", + DataType::Struct(Fields::from(vec![ + Arc::new(arrow_schema::Field::new( + "col_5", + DataType::Int64, + true, + )), + Arc::new(arrow_schema::Field::new( + "col_6", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_6", + // Change to Utf8 + DataType::Utf8, + false, + ))), + false, + )), + ])), + false, + ))), + false, + )), + ])), + false, + ))), + false, + ) + .with_metadata(schema_fields[0].metadata().clone()), + )]); + + let converted_with_modified = convert_schema( + &schema, + ProjectionMask::all(), + Some(&modified_schema_fields), + )? + .unwrap(); + + assert_eq!( + converted_with_modified.arrow_type, + DataType::Struct(modified_schema_fields) + ); + + Ok(()) + } +} From cc5ab0d348a943feb63e107639556e6ab88c2f38 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 19:25:53 +0300 Subject: [PATCH 02/22] add tests with inferring list types as well --- parquet/src/arrow/schema/complex.rs | 94 ++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index ee8088d06809..fd8b3307fb24 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -664,7 +664,7 @@ mod tests { ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + .with_metadata(schema_fields[0].metadata().clone()), )])); assert_eq!(converted.arrow_type, expected_schema); @@ -679,7 +679,7 @@ mod tests { ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + .with_metadata(schema_fields[0].metadata().clone()), )]); // Should be able to convert the same thing @@ -688,7 +688,73 @@ mod tests { ProjectionMask::all(), Some(&utf8_instead_of_binary), )? - .unwrap(); + .unwrap(); + + // Assert that we changed to Utf8 + assert_eq!( + converted_again.arrow_type, + DataType::Struct(utf8_instead_of_binary) + ); + + Ok(()) + } + + #[test] + fn convert_schema_with_repeated_primitive_should_use_inferred_schema_for_list_as_well( + ) -> crate::errors::Result<()> { + let message_type = " + message schema { + repeated BYTE_ARRAY col_1 = 1; + } + "; + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + assert_eq!(schema_fields.len(), 1); + + let expected_schema = DataType::Struct(Fields::from(vec![Arc::new( + arrow_schema::Field::new( + "col_1", + DataType::List(Arc::new(arrow_schema::Field::new( + "col_1", + DataType::Binary, + false, + ))), + false, + ) + .with_metadata(schema_fields[0].metadata().clone()), + )])); + + assert_eq!(converted.arrow_type, expected_schema); + + let utf8_instead_of_binary = Fields::from(vec![Arc::new( + arrow_schema::Field::new( + "col_1", + // Inferring as LargeList instead of List + DataType::LargeList(Arc::new(arrow_schema::Field::new( + "col_1", + DataType::Utf8, + false, + ))), + false, + ) + .with_metadata(schema_fields[0].metadata().clone()), + )]); + + // Should be able to convert the same thing + let converted_again = convert_schema( + &schema, + ProjectionMask::all(), + Some(&utf8_instead_of_binary), + )? + .unwrap(); // Assert that we changed to Utf8 assert_eq!( @@ -727,7 +793,7 @@ mod tests { // Should be able to convert the same thing let converted_again = - convert_schema(&schema, ProjectionMask::all(), Some(&schema_fields))?.unwrap(); + convert_schema(&schema, ProjectionMask::all(), Some(&schema_fields))?.unwrap(); // Assert that we changed to Utf8 assert_eq!(converted_again.arrow_type, converted.arrow_type); @@ -818,26 +884,27 @@ message schema { assert_eq!(converted_again.arrow_type, converted.arrow_type); - // Test conversion with modified schema (change col_6 from Binary to Utf8) + // Test conversion with modified schema (change lists to either LargeList or FixedSizeList) + // as well as changing Binary to Utf8 or BinaryView let modified_schema_fields = Fields::from(vec![Arc::new( arrow_schema::Field::new( "col_1", - DataType::List(Arc::new(arrow_schema::Field::new( + DataType::LargeList(Arc::new(arrow_schema::Field::new( "col_1", DataType::Struct(Fields::from(vec![ - Arc::new(arrow_schema::Field::new("col_2", DataType::Binary, true)), + Arc::new(arrow_schema::Field::new("col_2", DataType::LargeBinary, true)), Arc::new(arrow_schema::Field::new( "col_3", - DataType::List(Arc::new(arrow_schema::Field::new( + DataType::LargeList(Arc::new(arrow_schema::Field::new( "col_3", - DataType::Binary, + DataType::Utf8, false, ))), false, )), Arc::new(arrow_schema::Field::new( "col_4", - DataType::List(Arc::new(arrow_schema::Field::new( + DataType::FixedSizeList(Arc::new(arrow_schema::Field::new( "col_4", DataType::Struct(Fields::from(vec![ Arc::new(arrow_schema::Field::new( @@ -847,17 +914,16 @@ message schema { )), Arc::new(arrow_schema::Field::new( "col_6", - DataType::List(Arc::new(arrow_schema::Field::new( + DataType::LargeList(Arc::new(arrow_schema::Field::new( "col_6", - // Change to Utf8 - DataType::Utf8, + DataType::BinaryView, false, ))), false, )), ])), false, - ))), + )), 3), false, )), ])), From fb2579adfe4deee2da0b6a1e212ebc6481d3c7a7 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 21:32:27 +0300 Subject: [PATCH 03/22] add more tests and fix bug --- parquet/src/arrow/schema/complex.rs | 385 +++++++++++++++++++++------- 1 file changed, 286 insertions(+), 99 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index fd8b3307fb24..1de919b35c65 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -58,14 +58,41 @@ pub struct ParquetField { impl ParquetField { /// Converts `self` into an arrow list, with its current type as the field type + /// accept an optional `list_data_type` to specify the type of list to create /// /// This is used to convert repeated columns, into their arrow representation - fn into_list(self, name: &str) -> Self { + fn into_list(self, parquet_field_type: &Type, list_data_type: Option) -> Self { + let arrow_field = match &list_data_type { + Some(DataType::List(field_hint)) + | Some(DataType::LargeList(field_hint)) + | Some(DataType::FixedSizeList(field_hint, _)) => Some(field_hint.as_ref()), + Some(_) => unreachable!( + "should be validated earlier that list_data_type is only a type of list" + ), + None => None, + }; + + let arrow_field = convert_field( + parquet_field_type, + &self, + arrow_field, + // Only add the field id to the list and not to the element + false, + ) + .with_nullable(false); + ParquetField { rep_level: self.rep_level, def_level: self.def_level, nullable: false, - arrow_type: DataType::List(Arc::new(Field::new(name, self.arrow_type.clone(), false))), + arrow_type: match list_data_type { + Some(DataType::List(_)) => DataType::List(Arc::new(arrow_field)), + Some(DataType::LargeList(_)) => DataType::LargeList(Arc::new(arrow_field)), + Some(DataType::FixedSizeList(_, len)) => { + DataType::FixedSizeList(Arc::new(arrow_field), len) + } + _ => DataType::List(Arc::new(arrow_field)), + }, field_type: ParquetFieldType::Group { children: vec![self], }, @@ -143,7 +170,25 @@ impl Visitor { let repetition = get_repetition(primitive_type); let (def_level, rep_level, nullable) = context.levels(repetition); - let arrow_type = convert_primitive(primitive_type, context.data_type)?; + let primitive_arrow_data_type = match repetition { + Repetition::REPEATED => { + let arrow_field = match &context.data_type { + Some(DataType::List(f)) => Some(f.as_ref()), + Some(DataType::LargeList(f)) => Some(f.as_ref()), + Some(DataType::FixedSizeList(f, _)) => Some(f.as_ref()), + Some(d) => return Err(arrow_err!( + "incompatible arrow schema, expected list got {} for repeated primitive field", + d + )), + None => None, + }; + + arrow_field.map(|f| f.data_type().clone()) + } + _ => context.data_type.clone(), + }; + + let arrow_type = convert_primitive(primitive_type, primitive_arrow_data_type)?; let primitive_field = ParquetField { rep_level, @@ -157,7 +202,7 @@ impl Visitor { }; Ok(Some(match repetition { - Repetition::REPEATED => primitive_field.into_list(primitive_type.name()), + Repetition::REPEATED => primitive_field.into_list(primitive_type, context.data_type), _ => primitive_field, })) } @@ -174,7 +219,25 @@ impl Visitor { let parquet_fields = struct_type.get_fields(); // Extract any arrow fields from the hints - let arrow_fields = match &context.data_type { + let arrow_struct = match repetition { + Repetition::REPEATED => { + let arrow_field = match &context.data_type { + Some(DataType::List(f)) => Some(f.as_ref()), + Some(DataType::LargeList(f)) => Some(f.as_ref()), + Some(DataType::FixedSizeList(f, _)) => Some(f.as_ref()), + Some(d) => return Err(arrow_err!( + "incompatible arrow schema, expected list got {} for repeated struct field", + d + )), + None => None, + }; + + arrow_field.map(|f| f.data_type()) + } + _ => context.data_type.as_ref(), + }; + + let arrow_fields = match &arrow_struct { Some(DataType::Struct(fields)) => { if fields.len() != parquet_fields.len() { return Err(arrow_err!( @@ -221,10 +284,10 @@ impl Visitor { data_type, }; - if let Some(mut child) = self.dispatch(parquet_field, child_ctx)? { + if let Some(child) = self.dispatch(parquet_field, child_ctx)? { // The child type returned may be different from what is encoded in the arrow // schema in the event of a mismatch or a projection - child_fields.push(convert_field(parquet_field, &mut child, arrow_field)); + child_fields.push(convert_field(parquet_field, &child, arrow_field, true)); children.push(child); } } @@ -242,7 +305,7 @@ impl Visitor { }; Ok(Some(match repetition { - Repetition::REPEATED => struct_field.into_list(struct_type.name()), + Repetition::REPEATED => struct_field.into_list(struct_type, context.data_type), _ => struct_field, })) } @@ -353,13 +416,13 @@ impl Visitor { // Need both columns to be projected match (maybe_key, maybe_value) { - (Some(mut key), Some(mut value)) => { + (Some(key), Some(value)) => { let key_field = Arc::new( - convert_field(map_key, &mut key, arrow_key) + convert_field(map_key, &key, arrow_key, true) // The key is always non-nullable (#5630) .with_nullable(false), ); - let value_field = Arc::new(convert_field(map_value, &mut value, arrow_value)); + let value_field = Arc::new(convert_field(map_value, &value, arrow_value, true)); let field_metadata = match arrow_map { Some(field) => field.metadata().clone(), _ => HashMap::default(), @@ -496,8 +559,8 @@ impl Visitor { }; match self.dispatch(item_type, new_context) { - Ok(Some(mut item)) => { - let item_field = Arc::new(convert_field(item_type, &mut item, arrow_field)); + Ok(Some(item)) => { + let item_field = Arc::new(convert_field(item_type, &item, arrow_field, true)); // Use arrow type as hint for index size let arrow_type = match context.data_type { @@ -547,8 +610,9 @@ impl Visitor { /// dictated by the `parquet_type`, and any metadata from `arrow_hint` fn convert_field( parquet_type: &Type, - field: &mut ParquetField, + field: &ParquetField, arrow_hint: Option<&Field>, + add_field_id: bool, ) -> Field { let name = parquet_type.name(); let data_type = field.arrow_type.clone(); @@ -572,7 +636,7 @@ fn convert_field( None => { let mut ret = Field::new(name, data_type, nullable); let basic_info = parquet_type.get_basic_info(); - if basic_info.has_id() { + if add_field_id && basic_info.has_id() { let mut meta = HashMap::with_capacity(1); meta.insert( PARQUET_FIELD_ID_META_KEY.to_string(), @@ -628,12 +692,60 @@ pub fn convert_type(parquet_type: &TypePtr) -> Result { #[cfg(test)] mod tests { use crate::arrow::schema::complex::convert_schema; - use crate::arrow::ProjectionMask; + use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; use arrow_schema::{DataType, Fields}; use std::sync::Arc; + trait WithFieldId { + fn with_field_id(self, id: i32) -> Self; + } + impl WithFieldId for arrow_schema::Field { + fn with_field_id(self, id: i32) -> Self { + let mut metadata = self.metadata().clone(); + metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), id.to_string()); + self.with_metadata(metadata) + } + } + + #[test] + fn convert_schema_with_repeated_primitive() -> crate::errors::Result<()> { + let message_type = " + message schema { + repeated BYTE_ARRAY col_1 = 1; + } + "; + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + assert_eq!(schema_fields.len(), 1); + + let expected_schema = DataType::Struct(Fields::from(vec![Arc::new( + arrow_schema::Field::new( + "col_1", + DataType::List(Arc::new( + // No metadata on inner field + arrow_schema::Field::new("col_1", DataType::Binary, false), + )), + false, + ) + // add the field id to the outer list + .with_field_id(1), + )])); + + assert_eq!(converted.arrow_type, expected_schema); + + Ok(()) + } + #[test] fn convert_schema_with_repeated_primitive_should_use_inferred_schema( ) -> crate::errors::Result<()> { @@ -664,7 +776,7 @@ mod tests { ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + .with_metadata(schema_fields[0].metadata().clone()), )])); assert_eq!(converted.arrow_type, expected_schema); @@ -679,7 +791,7 @@ mod tests { ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + .with_metadata(schema_fields[0].metadata().clone()), )]); // Should be able to convert the same thing @@ -688,7 +800,7 @@ mod tests { ProjectionMask::all(), Some(&utf8_instead_of_binary), )? - .unwrap(); + .unwrap(); // Assert that we changed to Utf8 assert_eq!( @@ -729,7 +841,7 @@ mod tests { ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + .with_metadata(schema_fields[0].metadata().clone()), )])); assert_eq!(converted.arrow_type, expected_schema); @@ -745,7 +857,7 @@ mod tests { ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + .with_metadata(schema_fields[0].metadata().clone()), )]); // Should be able to convert the same thing @@ -754,7 +866,7 @@ mod tests { ProjectionMask::all(), Some(&utf8_instead_of_binary), )? - .unwrap(); + .unwrap(); // Assert that we changed to Utf8 assert_eq!( @@ -769,12 +881,12 @@ mod tests { fn convert_schema_with_repeated_struct_and_inferred_schema() -> crate::errors::Result<()> { let message_type = " message schema { - repeated group col_1 { - optional binary col_2; - optional binary col_3; - optional group col_4 { - optional int64 col_5; - optional int32 col_6; + repeated group my_col_1 = 1 { + optional binary my_col_2 = 2; + optional binary my_col_3 = 3; + optional group my_col_4 = 4 { + optional int64 my_col_5 = 5; + optional int32 my_col_6 = 6; } } } @@ -793,7 +905,44 @@ mod tests { // Should be able to convert the same thing let converted_again = - convert_schema(&schema, ProjectionMask::all(), Some(&schema_fields))?.unwrap(); + convert_schema(&schema, ProjectionMask::all(), Some(schema_fields))?.unwrap(); + + // Assert that we changed to Utf8 + assert_eq!(converted_again.arrow_type, converted.arrow_type); + + Ok(()) + } + + #[test] + fn convert_schema_with_repeated_struct_and_inferred_schema_and_field_id( + ) -> crate::errors::Result<()> { + let message_type = " + message schema { + repeated group my_col_1 = 1 { + optional binary my_col_2 = 2; + optional binary my_col_3 = 3; + optional group my_col_4 = 4 { + optional int64 my_col_5 = 5; + optional int32 my_col_6 = 6; + } + } + } + "; + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + assert_eq!(schema_fields.len(), 1); + + // Should be able to convert the same thing + let converted_again = + convert_schema(&schema, ProjectionMask::all(), Some(schema_fields))?.unwrap(); // Assert that we changed to Utf8 assert_eq!(converted_again.arrow_type, converted.arrow_type); @@ -805,12 +954,12 @@ mod tests { fn convert_schema_with_nested_repeated_struct_and_primitives() -> crate::errors::Result<()> { let message_type = " message schema { - repeated group col_1 { - optional binary col_2; - repeated BYTE_ARRAY col_3; - repeated group col_4 { - optional int64 col_5; - repeated binary col_6; + repeated group my_col_1 = 1 { + optional binary my_col_2 = 2; + repeated BYTE_ARRAY my_col_3 = 3; + repeated group my_col_4 = 4 { + optional int64 my_col_5 = 5; + repeated binary my_col_6 = 6; } } } @@ -830,57 +979,77 @@ message schema { // Build expected schema let expected_schema = DataType::Struct(Fields::from(vec![Arc::new( arrow_schema::Field::new( - "col_1", + "my_col_1", DataType::List(Arc::new(arrow_schema::Field::new( - "col_1", + "my_col_1", DataType::Struct(Fields::from(vec![ - Arc::new(arrow_schema::Field::new("col_2", DataType::Binary, true)), - Arc::new(arrow_schema::Field::new( - "col_3", - DataType::List(Arc::new(arrow_schema::Field::new( - "col_3", - DataType::Binary, + Arc::new( + arrow_schema::Field::new("my_col_2", DataType::Binary, true) + .with_field_id(2), + ), + Arc::new( + arrow_schema::Field::new( + "my_col_3", + DataType::List(Arc::new(arrow_schema::Field::new( + "my_col_3", + DataType::Binary, + false, + ))), false, - ))), - false, - )), - Arc::new(arrow_schema::Field::new( - "col_4", - DataType::List(Arc::new(arrow_schema::Field::new( - "col_4", - DataType::Struct(Fields::from(vec![ - Arc::new(arrow_schema::Field::new( - "col_5", - DataType::Int64, - true, - )), - Arc::new(arrow_schema::Field::new( - "col_6", - DataType::List(Arc::new(arrow_schema::Field::new( - "col_6", - DataType::Binary, - false, - ))), - false, - )), - ])), + ) + // add the field id to the outer list + .with_field_id(3), + ), + Arc::new( + arrow_schema::Field::new( + "my_col_4", + DataType::List(Arc::new(arrow_schema::Field::new( + "my_col_4", + DataType::Struct(Fields::from(vec![ + Arc::new( + arrow_schema::Field::new( + "my_col_5", + DataType::Int64, + true, + ) + // add the field id to the outer list + .with_field_id(5), + ), + Arc::new( + arrow_schema::Field::new( + "my_col_6", + DataType::List(Arc::new(arrow_schema::Field::new( + "my_col_6", + DataType::Binary, + false, + ))), + false, + ) + // add the field id to the outer list + .with_field_id(6), + ), + ])), + false, + ))), false, - ))), - false, - )), + ) + // add the field id to the outer list + .with_field_id(4), + ), ])), false, ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + // add the field id to the outer list + .with_field_id(1), )])); assert_eq!(converted.arrow_type, expected_schema); // Test conversion with inferred schema let converted_again = - convert_schema(&schema, ProjectionMask::all(), Some(&schema_fields))?.unwrap(); + convert_schema(&schema, ProjectionMask::all(), Some(schema_fields))?.unwrap(); assert_eq!(converted_again.arrow_type, converted.arrow_type); @@ -888,50 +1057,68 @@ message schema { // as well as changing Binary to Utf8 or BinaryView let modified_schema_fields = Fields::from(vec![Arc::new( arrow_schema::Field::new( - "col_1", + "my_col_1", DataType::LargeList(Arc::new(arrow_schema::Field::new( - "col_1", + "my_col_1", DataType::Struct(Fields::from(vec![ - Arc::new(arrow_schema::Field::new("col_2", DataType::LargeBinary, true)), Arc::new(arrow_schema::Field::new( - "col_3", + "my_col_2", + DataType::LargeBinary, + true, + ) + .with_field_id(2)), + Arc::new(arrow_schema::Field::new( + "my_col_3", DataType::LargeList(Arc::new(arrow_schema::Field::new( - "col_3", + "my_col_3", DataType::Utf8, false, ))), false, - )), + ) + // add the field id to the outer list + .with_field_id(3)), Arc::new(arrow_schema::Field::new( - "col_4", - DataType::FixedSizeList(Arc::new(arrow_schema::Field::new( - "col_4", - DataType::Struct(Fields::from(vec![ - Arc::new(arrow_schema::Field::new( - "col_5", - DataType::Int64, - true, - )), - Arc::new(arrow_schema::Field::new( - "col_6", - DataType::LargeList(Arc::new(arrow_schema::Field::new( - "col_6", - DataType::BinaryView, + "my_col_4", + DataType::FixedSizeList( + Arc::new(arrow_schema::Field::new( + "my_col_4", + DataType::Struct(Fields::from(vec![ + Arc::new(arrow_schema::Field::new( + "my_col_5", + DataType::Int64, + true, + ) + .with_field_id(5)), + Arc::new(arrow_schema::Field::new( + "my_col_6", + DataType::LargeList(Arc::new( + arrow_schema::Field::new( + "my_col_6", + DataType::BinaryView, + false, + ), + )), false, - ))), - false, - )), - ])), - false, - )), 3), + ) + // add the field id to the outer list + .with_field_id(6)), + ])), + false, + )), + 3, + ), false, - )), + ) + // add the field id to the outer list + .with_field_id(4)), ])), false, ))), false, ) - .with_metadata(schema_fields[0].metadata().clone()), + // add the field id to the outer list + .with_field_id(1) )]); let converted_with_modified = convert_schema( From 297cf651ad008feb43b9385f7c2761a6fbf6d44a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 21:34:41 +0300 Subject: [PATCH 04/22] format --- parquet/src/arrow/schema/complex.rs | 112 +++++++++++++++------------- 1 file changed, 60 insertions(+), 52 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 1de919b35c65..224407bdc407 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -225,10 +225,12 @@ impl Visitor { Some(DataType::List(f)) => Some(f.as_ref()), Some(DataType::LargeList(f)) => Some(f.as_ref()), Some(DataType::FixedSizeList(f, _)) => Some(f.as_ref()), - Some(d) => return Err(arrow_err!( + Some(d) => { + return Err(arrow_err!( "incompatible arrow schema, expected list got {} for repeated struct field", d - )), + )) + } None => None, }; @@ -1041,8 +1043,8 @@ message schema { ))), false, ) - // add the field id to the outer list - .with_field_id(1), + // add the field id to the outer list + .with_field_id(1), )])); assert_eq!(converted.arrow_type, expected_schema); @@ -1061,64 +1063,70 @@ message schema { DataType::LargeList(Arc::new(arrow_schema::Field::new( "my_col_1", DataType::Struct(Fields::from(vec![ - Arc::new(arrow_schema::Field::new( - "my_col_2", - DataType::LargeBinary, - true, - ) - .with_field_id(2)), - Arc::new(arrow_schema::Field::new( - "my_col_3", - DataType::LargeList(Arc::new(arrow_schema::Field::new( + Arc::new( + arrow_schema::Field::new("my_col_2", DataType::LargeBinary, true) + .with_field_id(2), + ), + Arc::new( + arrow_schema::Field::new( "my_col_3", - DataType::Utf8, + DataType::LargeList(Arc::new(arrow_schema::Field::new( + "my_col_3", + DataType::Utf8, + false, + ))), false, - ))), - false, - ) - // add the field id to the outer list - .with_field_id(3)), - Arc::new(arrow_schema::Field::new( - "my_col_4", - DataType::FixedSizeList( - Arc::new(arrow_schema::Field::new( - "my_col_4", - DataType::Struct(Fields::from(vec![ - Arc::new(arrow_schema::Field::new( - "my_col_5", - DataType::Int64, - true, - ) - .with_field_id(5)), - Arc::new(arrow_schema::Field::new( - "my_col_6", - DataType::LargeList(Arc::new( + ) + // add the field id to the outer list + .with_field_id(3), + ), + Arc::new( + arrow_schema::Field::new( + "my_col_4", + DataType::FixedSizeList( + Arc::new(arrow_schema::Field::new( + "my_col_4", + DataType::Struct(Fields::from(vec![ + Arc::new( + arrow_schema::Field::new( + "my_col_5", + DataType::Int64, + true, + ) + .with_field_id(5), + ), + Arc::new( arrow_schema::Field::new( "my_col_6", - DataType::BinaryView, + DataType::LargeList(Arc::new( + arrow_schema::Field::new( + "my_col_6", + DataType::BinaryView, + false, + ), + )), false, - ), - )), - false, - ) - // add the field id to the outer list - .with_field_id(6)), - ])), - false, - )), - 3, - ), - false, - ) - // add the field id to the outer list - .with_field_id(4)), + ) + // add the field id to the outer list + .with_field_id(6), + ), + ])), + false, + )), + 3, + ), + false, + ) + // add the field id to the outer list + .with_field_id(4), + ), ])), false, ))), false, ) - // add the field id to the outer list - .with_field_id(1) + // add the field id to the outer list + .with_field_id(1), )]); let converted_with_modified = convert_schema( From 85d84b6f6bcb6dc53717fc983c6f888ee1bf5a7b Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 21:59:49 +0300 Subject: [PATCH 05/22] added more tests as I know I have a bug --- parquet/src/arrow/schema/complex.rs | 125 ++++++++++++++++++++++------ 1 file changed, 101 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 224407bdc407..bf1eceb217ed 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -711,6 +711,103 @@ mod tests { } } + fn test_roundtrip(message_type: &str) -> crate::errors::Result<()> { + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + // Should be able to convert the same thing + let converted_again = + convert_schema(&schema, ProjectionMask::all(), Some(schema_fields))?.unwrap(); + + // Assert that we changed to Utf8 + assert_eq!(converted_again.arrow_type, converted.arrow_type); + + Ok(()) + } + + #[test] + fn basic_backward_compatible_list() -> crate::errors::Result<()> { + test_roundtrip(" + message schema { + optional group my_list (LIST) { + repeated int32 element; + } + } + ") + } + + #[test] + fn basic_backward_compatible_list_2() -> crate::errors::Result<()> { + test_roundtrip(" + message schema { + optional group my_list (LIST) { + repeated group element { + required binary str (STRING); + required int32 num; + }; + } + } + ") + } + + #[test] + fn basic_backward_compatible_list_3() -> crate::errors::Result<()> { + test_roundtrip(" + message schema { + optional group my_list (LIST) { + repeated group array (LIST) { + repeated int32 array; + }; +} + } + ") + } + + #[test] + fn basic_backward_compatible_list_4_1() -> crate::errors::Result<()> { + test_roundtrip(" + message schema { + optional group my_list (LIST) { + repeated group array { + required binary str (STRING); + }; +} + } + ") + } + + #[test] + fn basic_backward_compatible_list_4_2() -> crate::errors::Result<()> { + test_roundtrip(" + message schema { + optional group my_list (LIST) { + repeated group my_list_tuple { + required binary str (STRING); + }; +} + } + ") + } + + #[test] + fn basic_backward_compatible_list_5() -> crate::errors::Result<()> { + test_roundtrip(" + message schema { + optional group my_list (LIST) { + repeated group element { + optional binary str (STRING); + }; +} + } + ") + } + #[test] fn convert_schema_with_repeated_primitive() -> crate::errors::Result<()> { let message_type = " @@ -739,8 +836,8 @@ mod tests { )), false, ) - // add the field id to the outer list - .with_field_id(1), + // add the field id to the outer list + .with_field_id(1), )])); assert_eq!(converted.arrow_type, expected_schema); @@ -881,7 +978,7 @@ mod tests { #[test] fn convert_schema_with_repeated_struct_and_inferred_schema() -> crate::errors::Result<()> { - let message_type = " + test_roundtrip(" message schema { repeated group my_col_1 = 1 { optional binary my_col_2 = 2; @@ -892,27 +989,7 @@ mod tests { } } } - "; - - let parsed_input_schema = Arc::new(parse_message_type(message_type)?); - let schema = SchemaDescriptor::new(parsed_input_schema); - - let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); - - let DataType::Struct(schema_fields) = &converted.arrow_type else { - panic!("Expected struct from convert_schema"); - }; - - assert_eq!(schema_fields.len(), 1); - - // Should be able to convert the same thing - let converted_again = - convert_schema(&schema, ProjectionMask::all(), Some(schema_fields))?.unwrap(); - - // Assert that we changed to Utf8 - assert_eq!(converted_again.arrow_type, converted.arrow_type); - - Ok(()) + ") } #[test] From 52eceedb714512bc447c629bbe0a095a6f7086aa Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:02:14 +0300 Subject: [PATCH 06/22] format --- parquet/src/arrow/schema/complex.rs | 94 ++++++++++++++--------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index bf1eceb217ed..cdb6172f66bc 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -734,78 +734,78 @@ mod tests { #[test] fn basic_backward_compatible_list() -> crate::errors::Result<()> { test_roundtrip(" - message schema { - optional group my_list (LIST) { - repeated int32 element; - } - } - ") + message schema { + optional group my_list (LIST) { + repeated int32 element; + } + } + ") } #[test] fn basic_backward_compatible_list_2() -> crate::errors::Result<()> { test_roundtrip(" - message schema { - optional group my_list (LIST) { - repeated group element { - required binary str (STRING); - required int32 num; - }; - } - } - ") + message schema { + optional group my_list (LIST) { + repeated group element { + required binary str (STRING); + required int32 num; + }; + } + } + ") } #[test] fn basic_backward_compatible_list_3() -> crate::errors::Result<()> { test_roundtrip(" - message schema { - optional group my_list (LIST) { - repeated group array (LIST) { - repeated int32 array; - }; -} - } - ") + message schema { + optional group my_list (LIST) { + repeated group array (LIST) { + repeated int32 array; + }; + } + } + ") } #[test] fn basic_backward_compatible_list_4_1() -> crate::errors::Result<()> { test_roundtrip(" - message schema { - optional group my_list (LIST) { - repeated group array { - required binary str (STRING); - }; -} - } - ") + message schema { + optional group my_list (LIST) { + repeated group array { + required binary str (STRING); + }; + } + } + ") } #[test] fn basic_backward_compatible_list_4_2() -> crate::errors::Result<()> { test_roundtrip(" - message schema { - optional group my_list (LIST) { - repeated group my_list_tuple { - required binary str (STRING); - }; -} - } - ") + message schema { + optional group my_list (LIST) { + repeated group my_list_tuple { + required binary str (STRING); + }; + } + } + ") } #[test] fn basic_backward_compatible_list_5() -> crate::errors::Result<()> { test_roundtrip(" - message schema { - optional group my_list (LIST) { - repeated group element { - optional binary str (STRING); - }; -} - } - ") + message schema { + optional group my_list (LIST) { + repeated group element { + optional binary str (STRING); + }; + } + } + ") } #[test] From d299ca9eed10169f193df9fb642753223d1a1f22 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:11:16 +0300 Subject: [PATCH 07/22] add more tests --- parquet/src/arrow/schema/complex.rs | 65 ++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index cdb6172f66bc..6ec906343ab8 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -697,7 +697,7 @@ mod tests { use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; - use arrow_schema::{DataType, Fields}; + use arrow_schema::{DataType, Field, Fields}; use std::sync::Arc; trait WithFieldId { @@ -731,20 +731,43 @@ mod tests { Ok(()) } + fn test_expected_type(message_type: &str, expected_fields: Fields) -> crate::errors::Result<()> { + test_roundtrip(message_type)?; + + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + assert_eq!(schema_fields, &expected_fields); + + Ok(()) + } + #[test] fn basic_backward_compatible_list() -> crate::errors::Result<()> { - test_roundtrip(" + test_expected_type(" message schema { - optional group my_list (LIST) { + optional group my_list (LIST) { repeated int32 element; } } - ") + ", Fields::from(vec![ + // Rule 1: List (nullable list, non-null elements) + Field::new("my_list", DataType::List(Arc::new( + Field::new("element", DataType::Int32, false) + )), true) + ])) } #[test] fn basic_backward_compatible_list_2() -> crate::errors::Result<()> { - test_roundtrip(" + test_expected_type(" message schema { optional group my_list (LIST) { repeated group element { @@ -753,12 +776,22 @@ mod tests { }; } } - ") + ", Fields::from(vec![ + // Rule 2: List> (nullable list, non-null elements) + Field::new("my_list", DataType::List(Arc::new( + Field::new("element", DataType::Struct( + Fields::from(vec![ + Field::new("str", DataType::Binary, false), + Field::new("num", DataType::Int32, false), + ]) + ), false) + )), true) + ])) } #[test] fn basic_backward_compatible_list_3() -> crate::errors::Result<()> { - test_roundtrip(" + test_expected_type(" message schema { optional group my_list (LIST) { repeated group array (LIST) { @@ -766,7 +799,14 @@ mod tests { }; } } - ") + ", Fields::from(vec![ + // Rule 3: List> (nullable outer list, non-null elements) + Field::new("my_list", DataType::List(Arc::new( + Field::new("array", DataType::List(Arc::new( + Field::new("array", DataType::Int32, false) + )), false) + )), true) + ])) } #[test] @@ -797,7 +837,7 @@ mod tests { #[test] fn basic_backward_compatible_list_5() -> crate::errors::Result<()> { - test_roundtrip(" + test_expected_type(" message schema { optional group my_list (LIST) { repeated group element { @@ -805,7 +845,12 @@ mod tests { }; } } - ") + ", Fields::from(vec![ + // Rule 5: List (nullable list, nullable elements) + Field::new("my_list", DataType::List(Arc::new( + Field::new("element", DataType::Binary, true) + )), true) + ])) } #[test] From 1aa573fc91f6b08678d5174343fc6a71bdacf743 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:38:06 +0300 Subject: [PATCH 08/22] fix --- parquet/src/arrow/schema/complex.rs | 252 ++++++++++++++++++++++------ 1 file changed, 199 insertions(+), 53 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 6ec906343ab8..c2e7c8d792a5 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -58,10 +58,29 @@ pub struct ParquetField { impl ParquetField { /// Converts `self` into an arrow list, with its current type as the field type - /// accept an optional `list_data_type` to specify the type of list to create /// /// This is used to convert repeated columns, into their arrow representation - fn into_list(self, parquet_field_type: &Type, list_data_type: Option) -> Self { + fn into_list(self, name: &str) -> Self { + ParquetField { + rep_level: self.rep_level, + def_level: self.def_level, + nullable: false, + arrow_type: DataType::List(Arc::new(Field::new(name, self.arrow_type.clone(), false))), + field_type: ParquetFieldType::Group { + children: vec![self], + }, + } + } + + /// Converts `self` into an arrow list, with its current type as the field type + /// accept an optional `list_data_type` to specify the type of list to create + /// + /// This is used to convert deprecated repeated columns (not in a list), into their arrow representation + fn into_list_with_arrow_list_hint( + self, + parquet_field_type: &Type, + list_data_type: Option, + ) -> Self { let arrow_field = match &list_data_type { Some(DataType::List(field_hint)) | Some(DataType::LargeList(field_hint)) @@ -127,6 +146,13 @@ struct VisitorContext { def_level: i16, /// An optional [`DataType`] sourced from the embedded arrow schema data_type: Option, + + /// Whether to treat repeated types as list from arrow types or + /// when true, if data_type provided it should be DataType::List() (or other list type) + /// and the list field data type would be treated as the hint for the parquet type + /// + /// when false, if data_type provided it will be treated as the hint without unwrapping + treat_repeated_as_list_arrow_hint: bool, } impl VisitorContext { @@ -171,7 +197,7 @@ impl Visitor { let (def_level, rep_level, nullable) = context.levels(repetition); let primitive_arrow_data_type = match repetition { - Repetition::REPEATED => { + Repetition::REPEATED if context.treat_repeated_as_list_arrow_hint => { let arrow_field = match &context.data_type { Some(DataType::List(f)) => Some(f.as_ref()), Some(DataType::LargeList(f)) => Some(f.as_ref()), @@ -202,7 +228,10 @@ impl Visitor { }; Ok(Some(match repetition { - Repetition::REPEATED => primitive_field.into_list(primitive_type, context.data_type), + Repetition::REPEATED if context.treat_repeated_as_list_arrow_hint => { + primitive_field.into_list_with_arrow_list_hint(primitive_type, context.data_type) + } + Repetition::REPEATED => primitive_field.into_list(primitive_type.name()), _ => primitive_field, })) } @@ -220,7 +249,7 @@ impl Visitor { // Extract any arrow fields from the hints let arrow_struct = match repetition { - Repetition::REPEATED => { + Repetition::REPEATED if context.treat_repeated_as_list_arrow_hint => { let arrow_field = match &context.data_type { Some(DataType::List(f)) => Some(f.as_ref()), Some(DataType::LargeList(f)) => Some(f.as_ref()), @@ -284,6 +313,7 @@ impl Visitor { rep_level, def_level, data_type, + treat_repeated_as_list_arrow_hint: true, }; if let Some(child) = self.dispatch(parquet_field, child_ctx)? { @@ -307,7 +337,10 @@ impl Visitor { }; Ok(Some(match repetition { - Repetition::REPEATED => struct_field.into_list(struct_type, context.data_type), + Repetition::REPEATED if context.treat_repeated_as_list_arrow_hint => { + struct_field.into_list_with_arrow_list_hint(struct_type, context.data_type) + } + Repetition::REPEATED => struct_field.into_list(struct_type.name()), _ => struct_field, })) } @@ -401,6 +434,7 @@ impl Visitor { rep_level, def_level, data_type: arrow_key.map(|x| x.data_type().clone()), + treat_repeated_as_list_arrow_hint: true, }; self.dispatch(map_key, context)? @@ -411,6 +445,7 @@ impl Visitor { rep_level, def_level, data_type: arrow_value.map(|x| x.data_type().clone()), + treat_repeated_as_list_arrow_hint: true, }; self.dispatch(map_value, context)? @@ -507,6 +542,7 @@ impl Visitor { rep_level: context.rep_level, def_level, data_type: arrow_field.map(|f| f.data_type().clone()), + treat_repeated_as_list_arrow_hint: false, }; return match self.visit_primitive(repeated_field, context) { @@ -538,6 +574,7 @@ impl Visitor { rep_level: context.rep_level, def_level, data_type: arrow_field.map(|f| f.data_type().clone()), + treat_repeated_as_list_arrow_hint: false, }; return match self.visit_struct(repeated_field, context) { @@ -558,6 +595,7 @@ impl Visitor { def_level, rep_level, data_type: arrow_field.map(|f| f.data_type().clone()), + treat_repeated_as_list_arrow_hint: true, }; match self.dispatch(item_type, new_context) { @@ -670,6 +708,7 @@ pub fn convert_schema( rep_level: 0, def_level: 0, data_type: embedded_arrow_schema.map(|fields| DataType::Struct(fields.clone())), + treat_repeated_as_list_arrow_hint: true, }; visitor.dispatch(&schema.root_schema_ptr(), context) @@ -686,6 +725,8 @@ pub fn convert_type(parquet_type: &TypePtr) -> Result { rep_level: 0, def_level: 0, data_type: None, + // We might be inside list + treat_repeated_as_list_arrow_hint: false, }; Ok(visitor.dispatch(parquet_type, context)?.unwrap()) @@ -723,7 +764,7 @@ mod tests { // Should be able to convert the same thing let converted_again = - convert_schema(&schema, ProjectionMask::all(), Some(schema_fields))?.unwrap(); + convert_schema(&schema, ProjectionMask::all(), Some(schema_fields))?.unwrap(); // Assert that we changed to Utf8 assert_eq!(converted_again.arrow_type, converted.arrow_type); @@ -731,10 +772,12 @@ mod tests { Ok(()) } - fn test_expected_type(message_type: &str, expected_fields: Fields) -> crate::errors::Result<()> { + fn test_expected_type( + message_type: &str, + expected_fields: Fields, + ) -> crate::errors::Result<()> { test_roundtrip(message_type)?; - let parsed_input_schema = Arc::new(parse_message_type(message_type)?); let schema = SchemaDescriptor::new(parsed_input_schema); @@ -750,24 +793,31 @@ mod tests { } #[test] - fn basic_backward_compatible_list() -> crate::errors::Result<()> { - test_expected_type(" + fn basic_backward_compatible_list_1() -> crate::errors::Result<()> { + test_expected_type( + " message schema { optional group my_list (LIST) { repeated int32 element; } } - ", Fields::from(vec![ - // Rule 1: List (nullable list, non-null elements) - Field::new("my_list", DataType::List(Arc::new( - Field::new("element", DataType::Int32, false) - )), true) - ])) + ", + Fields::from(vec![ + // Rule 1: List (nullable list, non-null elements) + Field::new( + "my_list", + DataType::List(Arc::new(Field::new("element", DataType::Int32, false))), + true, + ), + ]), + ) } #[test] + #[ignore = "not working yet"] fn basic_backward_compatible_list_2() -> crate::errors::Result<()> { - test_expected_type(" + test_expected_type( + " message schema { optional group my_list (LIST) { repeated group element { @@ -776,22 +826,30 @@ mod tests { }; } } - ", Fields::from(vec![ - // Rule 2: List> (nullable list, non-null elements) - Field::new("my_list", DataType::List(Arc::new( - Field::new("element", DataType::Struct( - Fields::from(vec![ - Field::new("str", DataType::Binary, false), - Field::new("num", DataType::Int32, false), - ]) - ), false) - )), true) - ])) + ", + Fields::from(vec![ + // Rule 2: List> (nullable list, non-null elements) + Field::new( + "my_list", + DataType::List(Arc::new(Field::new( + "element", + DataType::Struct(Fields::from(vec![ + Field::new("str", DataType::Binary, false), + Field::new("num", DataType::Int32, false), + ])), + false, + ))), + true, + ), + ]), + ) } #[test] + #[ignore = "not working yet"] fn basic_backward_compatible_list_3() -> crate::errors::Result<()> { - test_expected_type(" + test_expected_type( + " message schema { optional group my_list (LIST) { repeated group array (LIST) { @@ -799,19 +857,27 @@ mod tests { }; } } - ", Fields::from(vec![ - // Rule 3: List> (nullable outer list, non-null elements) - Field::new("my_list", DataType::List(Arc::new( - Field::new("array", DataType::List(Arc::new( - Field::new("array", DataType::Int32, false) - )), false) - )), true) - ])) + ", + Fields::from(vec![ + // Rule 3: List> (nullable outer list, non-null elements) + Field::new( + "my_list", + DataType::List(Arc::new(Field::new( + "array", + DataType::List(Arc::new(Field::new("array", DataType::Int32, false))), + false, + ))), + true, + ), + ]), + ) } #[test] + #[ignore = "not working yet"] fn basic_backward_compatible_list_4_1() -> crate::errors::Result<()> { - test_roundtrip(" + test_roundtrip( + " message schema { optional group my_list (LIST) { repeated group array { @@ -819,12 +885,15 @@ mod tests { }; } } - ") + ", + ) } #[test] + #[ignore = "not working yet"] fn basic_backward_compatible_list_4_2() -> crate::errors::Result<()> { - test_roundtrip(" + test_roundtrip( + " message schema { optional group my_list (LIST) { repeated group my_list_tuple { @@ -832,12 +901,15 @@ mod tests { }; } } - ") + ", + ) } #[test] + #[ignore = "not working yet"] fn basic_backward_compatible_list_5() -> crate::errors::Result<()> { - test_expected_type(" + test_expected_type( + " message schema { optional group my_list (LIST) { repeated group element { @@ -845,12 +917,84 @@ mod tests { }; } } - ", Fields::from(vec![ - // Rule 5: List (nullable list, nullable elements) - Field::new("my_list", DataType::List(Arc::new( - Field::new("element", DataType::Binary, true) - )), true) - ])) + ", + Fields::from(vec![ + // Rule 5: List (nullable list, nullable elements) + Field::new( + "my_list", + DataType::List(Arc::new(Field::new("element", DataType::Binary, true))), + true, + ), + ]), + ) + } + + #[test] + fn basic_backward_compatible_map_1() -> crate::errors::Result<()> { + test_expected_type( + " + message schema { + optional group my_map (MAP) { + repeated group map { + required binary str (STRING); + required int32 num; + } + } + } + ", + Fields::from(vec![ + // Map (nullable map, non-null values) + Field::new( + "my_map", + DataType::Map( + Arc::new(Field::new( + "map", + DataType::Struct(Fields::from(vec![ + Field::new("str", DataType::Utf8, false), + Field::new("num", DataType::Int32, false), + ])), + false, + )), + false, + ), + true, + ), + ]), + ) + } + + #[test] + fn basic_backward_compatible_map_2() -> crate::errors::Result<()> { + test_expected_type( + " + message schema { + optional group my_map (MAP_KEY_VALUE) { + repeated group map { + required binary key (STRING); + optional int32 value; + } + } + } + ", + Fields::from(vec![ + // Map (nullable map, nullable values) + Field::new( + "my_map", + DataType::Map( + Arc::new(Field::new( + "map", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Int32, true), + ])), + false, + )), + false, + ), + true, + ), + ]), + ) } #[test] @@ -881,8 +1025,8 @@ mod tests { )), false, ) - // add the field id to the outer list - .with_field_id(1), + // add the field id to the outer list + .with_field_id(1), )])); assert_eq!(converted.arrow_type, expected_schema); @@ -1023,7 +1167,8 @@ mod tests { #[test] fn convert_schema_with_repeated_struct_and_inferred_schema() -> crate::errors::Result<()> { - test_roundtrip(" + test_roundtrip( + " message schema { repeated group my_col_1 = 1 { optional binary my_col_2 = 2; @@ -1034,7 +1179,8 @@ mod tests { } } } - ") + ", + ) } #[test] From ab54f02a55394d579bd2aa5a8ec6d5f95e8f42ad Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 22:41:35 +0300 Subject: [PATCH 09/22] align with main --- parquet/src/arrow/schema/complex.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index c2e7c8d792a5..cbd390647142 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -18,7 +18,7 @@ use std::collections::HashMap; use std::sync::Arc; -use crate::arrow::schema::extension::add_extension_type; +use crate::arrow::schema::extension::try_add_extension_type; use crate::arrow::schema::primitive::convert_primitive; use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY}; use crate::basic::{ConvertedType, Repetition}; @@ -80,7 +80,7 @@ impl ParquetField { self, parquet_field_type: &Type, list_data_type: Option, - ) -> Self { + ) -> Result { let arrow_field = match &list_data_type { Some(DataType::List(field_hint)) | Some(DataType::LargeList(field_hint)) @@ -97,10 +97,10 @@ impl ParquetField { arrow_field, // Only add the field id to the list and not to the element false, - ) + )? .with_nullable(false); - ParquetField { + Ok(ParquetField { rep_level: self.rep_level, def_level: self.def_level, nullable: false, @@ -115,7 +115,7 @@ impl ParquetField { field_type: ParquetFieldType::Group { children: vec![self], }, - } + }) } /// Returns a list of [`ParquetField`] children if this is a group type @@ -229,7 +229,7 @@ impl Visitor { Ok(Some(match repetition { Repetition::REPEATED if context.treat_repeated_as_list_arrow_hint => { - primitive_field.into_list_with_arrow_list_hint(primitive_type, context.data_type) + primitive_field.into_list_with_arrow_list_hint(primitive_type, context.data_type)? } Repetition::REPEATED => primitive_field.into_list(primitive_type.name()), _ => primitive_field, @@ -319,7 +319,7 @@ impl Visitor { if let Some(child) = self.dispatch(parquet_field, child_ctx)? { // The child type returned may be different from what is encoded in the arrow // schema in the event of a mismatch or a projection - child_fields.push(convert_field(parquet_field, &child, arrow_field, true)); + child_fields.push(convert_field(parquet_field, &child, arrow_field, true)?); children.push(child); } } @@ -338,7 +338,7 @@ impl Visitor { Ok(Some(match repetition { Repetition::REPEATED if context.treat_repeated_as_list_arrow_hint => { - struct_field.into_list_with_arrow_list_hint(struct_type, context.data_type) + struct_field.into_list_with_arrow_list_hint(struct_type, context.data_type)? } Repetition::REPEATED => struct_field.into_list(struct_type.name()), _ => struct_field, @@ -455,11 +455,11 @@ impl Visitor { match (maybe_key, maybe_value) { (Some(key), Some(value)) => { let key_field = Arc::new( - convert_field(map_key, &key, arrow_key, true) + convert_field(map_key, &key, arrow_key, true)? // The key is always non-nullable (#5630) .with_nullable(false), ); - let value_field = Arc::new(convert_field(map_value, &value, arrow_value, true)); + let value_field = Arc::new(convert_field(map_value, &value, arrow_value, true)?); let field_metadata = match arrow_map { Some(field) => field.metadata().clone(), _ => HashMap::default(), @@ -600,7 +600,7 @@ impl Visitor { match self.dispatch(item_type, new_context) { Ok(Some(item)) => { - let item_field = Arc::new(convert_field(item_type, &item, arrow_field, true)); + let item_field = Arc::new(convert_field(item_type, &item, arrow_field, true)?); // Use arrow type as hint for index size let arrow_type = match context.data_type { @@ -653,7 +653,7 @@ fn convert_field( field: &ParquetField, arrow_hint: Option<&Field>, add_field_id: bool, -) -> Field { +) -> Result { let name = parquet_type.name(); let data_type = field.arrow_type.clone(); let nullable = field.nullable; @@ -671,7 +671,7 @@ fn convert_field( _ => Field::new(name, data_type, nullable), }; - field.with_metadata(hint.metadata().clone()) + Ok(field.with_metadata(hint.metadata().clone())) } None => { let mut ret = Field::new(name, data_type, nullable); @@ -684,7 +684,7 @@ fn convert_field( ); ret.set_metadata(meta); } - add_extension_type(ret, parquet_type) + try_add_extension_type(ret, parquet_type) } } } From f0716dd83f850a052aed1d0e9167f624230f6c5a Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Sep 2025 15:11:50 +0300 Subject: [PATCH 10/22] set map align --- parquet/src/arrow/schema/complex.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index cbd390647142..e2d3a55c38e3 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -434,7 +434,8 @@ impl Visitor { rep_level, def_level, data_type: arrow_key.map(|x| x.data_type().clone()), - treat_repeated_as_list_arrow_hint: true, + // Key is not repeated + treat_repeated_as_list_arrow_hint: false, }; self.dispatch(map_key, context)? @@ -445,6 +446,7 @@ impl Visitor { rep_level, def_level, data_type: arrow_value.map(|x| x.data_type().clone()), + // Value type can be repeated treat_repeated_as_list_arrow_hint: true, }; From 470acbbe4d3d87b2dc2dc90b62966be109cf6df0 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Sep 2025 15:44:31 +0300 Subject: [PATCH 11/22] add more tests from spark codebase for more coverage --- parquet/src/arrow/schema/complex.rs | 355 +++++++++++++++++++++++++++- 1 file changed, 354 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index e2d3a55c38e3..1e644ca1aa76 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -1000,7 +1000,21 @@ mod tests { } #[test] - fn convert_schema_with_repeated_primitive() -> crate::errors::Result<()> { + fn convert_schema_with_nested_list_repeated_primitive() -> crate::errors::Result<()> { + test_roundtrip( + " + message schema { + optional group f1 (LIST) { + repeated group element { + repeated int32 element; + } + } + } + ") + } + + #[test] + fn convert_schema_with_repeated_primitive_keep_field_id() -> crate::errors::Result<()> { let message_type = " message schema { repeated BYTE_ARRAY col_1 = 1; @@ -1413,4 +1427,343 @@ message schema { Ok(()) } + + /// Backwards-compatibility: LIST with nullable element type - 1 - standard + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L452-L466) + #[test] + fn a1() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated group list { + optional int32 element; + } + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new("element", DataType::Int32, true))), + true, + )]), + ) + } + + /// Backwards-compatibility: LIST with nullable element type - 2 + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L468-L482) + #[test] + fn a2() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated group element { + optional int32 num; + } + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new( + "num", + DataType::Int32, + true, + ))), + true, + )]), + ) + } + + /// Backwards-compatibility: LIST with non-nullable element type - 1 - standard + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L484-L495) + #[test] + fn a3() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated group list { + required int32 element; + } + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new("element", DataType::Int32, false))), + true, + )]), + ) + } + + /// Backwards-compatibility: LIST with non-nullable element type - 2 + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L497-L508) + #[test] + fn a4() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated group element { + required int32 num; + } + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new("num", DataType::Int32, false))), + true, + )]), + ) + } + + /// Backwards-compatibility: LIST with non-nullable element type - 3 + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L510-L519) + #[test] + fn a5() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated int32 element; + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new("element", DataType::Int32, false))), + true, + )]), + ) + } + + /// Backwards-compatibility: LIST with non-nullable element type - 4 + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L521-L540) + #[test] + fn a6() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated group element { + required binary str (UTF8); + required int32 num; + } + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new( + "element", + DataType::Struct(Fields::from(vec![ + Field::new("str", DataType::Utf8, false), + Field::new("num", DataType::Int32, false), + ])), + false, + ))), true, + )]), + ) + } + + /// Backwards-compatibility: LIST with non-nullable element type - 5 - parquet-avro style + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L542-L559) + #[test] + fn a7() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated group array { + required binary str (UTF8); + } + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new( + "array", + DataType::Struct(Fields::from(vec![ + Field::new("str", DataType::Utf8, false), + ])), + false, + ))), + true, + )]), + ) + } + + /// Backwards-compatibility: LIST with non-nullable element type - 6 - parquet-thrift style + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L561-L578) + #[test] + fn a8() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (LIST) { + repeated group f1_tuple { + required binary str (UTF8); + } + } + }", + Fields::from(vec![Field::new( + "f1", + DataType::List(Arc::new(Field::new( + "f1_tuple", + DataType::Struct(Fields::from(vec![ + Field::new("str", DataType::Utf8, false), + ])), + false, + ))), + true, + )]), + ) + } + + /// Backwards-compatibility: MAP with non-nullable value type - 1 - standard + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L652-L667) + #[test] + fn a9() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (MAP) { + repeated group key_value { + required int32 key; + required binary value (UTF8); + } + } + }", + Fields::from(vec![Field::new_map( + "f1", + "key_value", + Field::new("key", DataType::Int32, false), + Field::new("value", DataType::Utf8, false), + false, + true, + )]), + ) + } + + /// Backwards-compatibility: MAP with non-nullable value type - 2 + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L669-L684) + #[test] + fn a10() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (MAP_KEY_VALUE) { + repeated group map { + required int32 num; + required binary str (UTF8); + } + } + }", + Fields::from(vec![Field::new_map( + "f1", + "map", + Field::new("num", DataType::Int32, false), + Field::new("str", DataType::Utf8, false), + false, + true, + )]), + ) + } + + /// Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L686-L701) + #[test] + fn a11() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (MAP) { + repeated group map (MAP_KEY_VALUE) { + required int32 key; + required binary value (UTF8); + } + } + }", + Fields::from(vec![Field::new_map( + "f1", + "map", + Field::new("key", DataType::Int32, false), + Field::new("value", DataType::Utf8, false), + false, + true, + )]), + ) + } + + /// Backwards-compatibility: MAP with nullable value type - 1 - standard + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L703-L718) + #[test] + fn a12() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (MAP) { + repeated group key_value { + required int32 key; + optional binary value (UTF8); + } + } + }", + Fields::from(vec![Field::new_map( + "f1", + "key_value", + Field::new("key", DataType::Int32, false), + Field::new("value", DataType::Utf8, true), + false, + true, + )]), + ) + } + + /// Backwards-compatibility: MAP with nullable value type - 2 + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L720-L735) + #[test] + fn a13() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (MAP_KEY_VALUE) { + repeated group map { + required int32 num; + optional binary str (UTF8); + } + } + }", + Fields::from(vec![Field::new_map( + "f1", + "map", + Field::new("num", DataType::Int32, false), + Field::new("str", DataType::Utf8, true), + false, + true, + )]), + ) + } + + /// Backwards-compatibility: MAP with nullable value type - 3 - parquet-avro style + /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L737-L752) + #[test] + fn a14() -> crate::errors::Result<()> { + test_expected_type( + " + message root { + optional group f1 (MAP) { + repeated group map (MAP_KEY_VALUE) { + required int32 key; + optional binary value (UTF8); + } + } + }", + Fields::from(vec![Field::new_map( + "f1", + "map", + Field::new("key", DataType::Int32, false), + Field::new("value", DataType::Utf8, true), + false, + true, + )]), + ) + } } From 03dfa5453a1dbeeda6e6fbc277f334b91e7332e1 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Sep 2025 15:44:58 +0300 Subject: [PATCH 12/22] rename --- parquet/src/arrow/schema/complex.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 1e644ca1aa76..741ee06a33b2 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -1431,7 +1431,7 @@ message schema { /// Backwards-compatibility: LIST with nullable element type - 1 - standard /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L452-L466) #[test] - fn a1() -> crate::errors::Result<()> { + fn list_nullable_element_standard() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1452,7 +1452,7 @@ message schema { /// Backwards-compatibility: LIST with nullable element type - 2 /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L468-L482) #[test] - fn a2() -> crate::errors::Result<()> { + fn list_nullable_element_nested() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1477,7 +1477,7 @@ message schema { /// Backwards-compatibility: LIST with non-nullable element type - 1 - standard /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L484-L495) #[test] - fn a3() -> crate::errors::Result<()> { + fn list_required_element_standard() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1498,7 +1498,7 @@ message schema { /// Backwards-compatibility: LIST with non-nullable element type - 2 /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L497-L508) #[test] - fn a4() -> crate::errors::Result<()> { + fn list_required_element_nested() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1519,7 +1519,7 @@ message schema { /// Backwards-compatibility: LIST with non-nullable element type - 3 /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L510-L519) #[test] - fn a5() -> crate::errors::Result<()> { + fn list_required_element_primitive() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1538,7 +1538,7 @@ message schema { /// Backwards-compatibility: LIST with non-nullable element type - 4 /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L521-L540) #[test] - fn a6() -> crate::errors::Result<()> { + fn list_required_element_struct() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1566,7 +1566,7 @@ message schema { /// Backwards-compatibility: LIST with non-nullable element type - 5 - parquet-avro style /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L542-L559) #[test] - fn a7() -> crate::errors::Result<()> { + fn list_required_element_avro_style() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1593,7 +1593,7 @@ message schema { /// Backwards-compatibility: LIST with non-nullable element type - 6 - parquet-thrift style /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L561-L578) #[test] - fn a8() -> crate::errors::Result<()> { + fn list_required_element_thrift_style() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1620,7 +1620,7 @@ message schema { /// Backwards-compatibility: MAP with non-nullable value type - 1 - standard /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L652-L667) #[test] - fn a9() -> crate::errors::Result<()> { + fn map_required_value_standard() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1645,7 +1645,7 @@ message schema { /// Backwards-compatibility: MAP with non-nullable value type - 2 /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L669-L684) #[test] - fn a10() -> crate::errors::Result<()> { + fn map_required_value_map_key_value() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1670,7 +1670,7 @@ message schema { /// Backwards-compatibility: MAP with non-nullable value type - 3 - prior to 1.4.x /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L686-L701) #[test] - fn a11() -> crate::errors::Result<()> { + fn map_required_value_legacy() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1695,7 +1695,7 @@ message schema { /// Backwards-compatibility: MAP with nullable value type - 1 - standard /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L703-L718) #[test] - fn a12() -> crate::errors::Result<()> { + fn map_optional_value_standard() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1720,7 +1720,7 @@ message schema { /// Backwards-compatibility: MAP with nullable value type - 2 /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L720-L735) #[test] - fn a13() -> crate::errors::Result<()> { + fn map_optional_value_map_key_value() -> crate::errors::Result<()> { test_expected_type( " message root { @@ -1745,7 +1745,7 @@ message schema { /// Backwards-compatibility: MAP with nullable value type - 3 - parquet-avro style /// Taken from [Spark](https://github.com/apache/spark/blob/8ab50765cd793169091d983b50d87a391f6ac1f4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala#L737-L752) #[test] - fn a14() -> crate::errors::Result<()> { + fn map_optional_value_avro_style() -> crate::errors::Result<()> { test_expected_type( " message root { From 301c7a7de400d58aa339f61d735195d41bc1a1f5 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Sep 2025 15:45:41 +0300 Subject: [PATCH 13/22] format --- parquet/src/arrow/schema/complex.rs | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 741ee06a33b2..ee3802a77076 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -1010,7 +1010,8 @@ mod tests { } } } - ") + ", + ) } #[test] @@ -1464,11 +1465,7 @@ message schema { }", Fields::from(vec![Field::new( "f1", - DataType::List(Arc::new(Field::new( - "num", - DataType::Int32, - true, - ))), + DataType::List(Arc::new(Field::new("num", DataType::Int32, true))), true, )]), ) @@ -1558,7 +1555,8 @@ message schema { Field::new("num", DataType::Int32, false), ])), false, - ))), true, + ))), + true, )]), ) } @@ -1580,9 +1578,7 @@ message schema { "f1", DataType::List(Arc::new(Field::new( "array", - DataType::Struct(Fields::from(vec![ - Field::new("str", DataType::Utf8, false), - ])), + DataType::Struct(Fields::from(vec![Field::new("str", DataType::Utf8, false)])), false, ))), true, @@ -1607,9 +1603,7 @@ message schema { "f1", DataType::List(Arc::new(Field::new( "f1_tuple", - DataType::Struct(Fields::from(vec![ - Field::new("str", DataType::Utf8, false), - ])), + DataType::Struct(Fields::from(vec![Field::new("str", DataType::Utf8, false)])), false, ))), true, From 39b9263e3f33772b11d575bfd723a0fc0b38c7bc Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:02:34 +0300 Subject: [PATCH 14/22] remove tests from ignore and added more comments --- parquet/src/arrow/schema/complex.rs | 61 ++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index ee3802a77076..b7b635717dc2 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -794,6 +794,7 @@ mod tests { Ok(()) } + /// Taken from the example in [Parquet Format - Nested Types - Lists - Backward-compatibility rules](https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/LogicalTypes.md?plain=1#L766-L769) #[test] fn basic_backward_compatible_list_1() -> crate::errors::Result<()> { test_expected_type( @@ -815,8 +816,8 @@ mod tests { ) } + /// Taken from the example in [Parquet Format - Nested Types - Lists - Backward-compatibility rules](https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/LogicalTypes.md?plain=1#L771-L777) #[test] - #[ignore = "not working yet"] fn basic_backward_compatible_list_2() -> crate::errors::Result<()> { test_expected_type( " @@ -825,7 +826,7 @@ mod tests { repeated group element { required binary str (STRING); required int32 num; - }; + } } } ", @@ -836,7 +837,7 @@ mod tests { DataType::List(Arc::new(Field::new( "element", DataType::Struct(Fields::from(vec![ - Field::new("str", DataType::Binary, false), + Field::new("str", DataType::Utf8, false), Field::new("num", DataType::Int32, false), ])), false, @@ -847,8 +848,8 @@ mod tests { ) } + /// Taken from the example in [Parquet Format - Nested Types - Lists - Backward-compatibility rules](https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/LogicalTypes.md?plain=1#L779-L784) #[test] - #[ignore = "not working yet"] fn basic_backward_compatible_list_3() -> crate::errors::Result<()> { test_expected_type( " @@ -856,7 +857,7 @@ mod tests { optional group my_list (LIST) { repeated group array (LIST) { repeated int32 array; - }; + } } } ", @@ -875,40 +876,72 @@ mod tests { ) } + /// Taken from the example in [Parquet Format - Nested Types - Lists - Backward-compatibility rules](https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/LogicalTypes.md?plain=1#L786-L791) #[test] - #[ignore = "not working yet"] fn basic_backward_compatible_list_4_1() -> crate::errors::Result<()> { - test_roundtrip( + test_expected_type( " message schema { optional group my_list (LIST) { repeated group array { required binary str (STRING); - }; + } } } ", + Fields::from(vec![ + // Rule 4: List> (nullable list, non-null elements) + Field::new( + "my_list", + DataType::List(Arc::new(Field::new( + "array", + DataType::Struct(Fields::from(vec![Field::new( + "str", + DataType::Utf8, + false, + )])), + false, + ))), + true, + ), + ]), ) } + /// Taken from the example in [Parquet Format - Nested Types - Lists - Backward-compatibility rules](https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/LogicalTypes.md?plain=1#L793-L798) #[test] - #[ignore = "not working yet"] fn basic_backward_compatible_list_4_2() -> crate::errors::Result<()> { - test_roundtrip( + test_expected_type( " message schema { optional group my_list (LIST) { repeated group my_list_tuple { required binary str (STRING); - }; + } } } ", + Fields::from(vec![ + // Rule 4: List> (nullable list, non-null elements) + Field::new( + "my_list", + DataType::List(Arc::new(Field::new( + "my_list_tuple", + DataType::Struct(Fields::from(vec![Field::new( + "str", + DataType::Utf8, + false, + )])), + false, + ))), + true, + ), + ]), ) } + /// Taken from the example in [Parquet Format - Nested Types - Lists - Backward-compatibility rules](https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/LogicalTypes.md?plain=1#L800-L805) #[test] - #[ignore = "not working yet"] fn basic_backward_compatible_list_5() -> crate::errors::Result<()> { test_expected_type( " @@ -916,7 +949,7 @@ mod tests { optional group my_list (LIST) { repeated group element { optional binary str (STRING); - }; + } } } ", @@ -924,7 +957,7 @@ mod tests { // Rule 5: List (nullable list, nullable elements) Field::new( "my_list", - DataType::List(Arc::new(Field::new("element", DataType::Binary, true))), + DataType::List(Arc::new(Field::new("str", DataType::Utf8, true))), true, ), ]), From 59303fdde56c377b5cedd6f18449709684931497 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 30 Sep 2025 18:28:36 +0300 Subject: [PATCH 15/22] remove or --- parquet/src/arrow/schema/complex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index b7b635717dc2..00e2449a050a 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -147,7 +147,7 @@ struct VisitorContext { /// An optional [`DataType`] sourced from the embedded arrow schema data_type: Option, - /// Whether to treat repeated types as list from arrow types or + /// Whether to treat repeated types as list from arrow types /// when true, if data_type provided it should be DataType::List() (or other list type) /// and the list field data type would be treated as the hint for the parquet type /// From 4d7485a8540197edfeb48621635adf4b6a5604ba Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:52:52 +0300 Subject: [PATCH 16/22] add link to deprecated repeated columns and replace panic with error --- parquet/src/arrow/schema/complex.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 00e2449a050a..1d527eab3b57 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -75,7 +75,9 @@ impl ParquetField { /// Converts `self` into an arrow list, with its current type as the field type /// accept an optional `list_data_type` to specify the type of list to create /// - /// This is used to convert deprecated repeated columns (not in a list), into their arrow representation + /// This is used to convert [deprecated repeated columns] (not in a list), into their arrow representation + /// + /// [deprecated repeated columns]: https://github.com/apache/parquet-format/blob/9fd57b59e0ce1a82a69237dcf8977d3e72a2965d/LogicalTypes.md?plain=1#L649-L650 fn into_list_with_arrow_list_hint( self, parquet_field_type: &Type, @@ -85,9 +87,9 @@ impl ParquetField { Some(DataType::List(field_hint)) | Some(DataType::LargeList(field_hint)) | Some(DataType::FixedSizeList(field_hint, _)) => Some(field_hint.as_ref()), - Some(_) => unreachable!( - "should be validated earlier that list_data_type is only a type of list" - ), + Some(_) => return Err(general_err!( + "Internal error: should be validated earlier that list_data_type is only a type of list" + )), None => None, }; From 3eaf7a7a0ee54b725d9c7e612a4a4cbf026ba0fd Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 4 Feb 2026 14:18:01 +0200 Subject: [PATCH 17/22] add comment and test why changing it to inherit the value is wrong --- parquet/src/arrow/schema/complex.rs | 94 +++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 065fda03ea3b..da40b73dd788 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -329,6 +329,9 @@ impl Visitor { rep_level, def_level, data_type, + + // We set this to true to reset because children of a struct should + // independently use their arrow hints for repeated fields. treat_repeated_as_list_arrow_hint: true, }; @@ -1098,6 +1101,97 @@ mod tests { ) } + + + /// Test that arrow hints are respected for nested repeated primitives inside backward-compatible + /// LIST struct elements. + /// + /// This tests that when we have: + /// - A backward-compatible LIST (rule 4) containing a struct + /// - That struct contains a repeated primitive + /// - An arrow schema hint specifies the inner repeated primitive should be a LargeList + /// + /// The conversion should respect the LargeList hint for the inner repeated primitive. + #[test] + fn backward_compat_list_struct_with_nested_repeated_primitive_respects_arrow_hint( + ) -> crate::errors::Result<()> { + // This is a backward-compatible LIST (rule 4) where the struct element contains + // a repeated primitive. The arrow hint specifies that the inner repeated primitive + // should be LargeList. + let message_type = " + message schema { + optional group my_list (LIST) { + repeated group my_list_tuple { + required binary str (STRING); + repeated int32 values; + } + } + } + "; + + test_roundtrip(message_type)?; + + let parsed_input_schema = Arc::new(parse_message_type(message_type)?); + let schema = SchemaDescriptor::new(parsed_input_schema); + + // First, convert without hints to get the default schema + let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); + + let DataType::Struct(schema_fields) = &converted.arrow_type else { + panic!("Expected struct from convert_schema"); + }; + + let expected_default = Fields::from(vec![Field::new( + "my_list", + DataType::List(Arc::new(Field::new( + "my_list_tuple", + DataType::Struct(Fields::from(vec![ + Field::new("str", DataType::Utf8, false), + Field::new( + "values", + DataType::List(Arc::new(Field::new("values", DataType::Int32, false))), + false, + ), + ])), + false, + ))), + true, + )]); + + assert_eq!(schema_fields, &expected_default); + + let modified_schema = Fields::from(vec![Field::new( + "my_list", + DataType::List(Arc::new(Field::new( + "my_list_tuple", + DataType::Struct(Fields::from(vec![ + Field::new("str", DataType::Utf8, false), + Field::new( + "values", + // Arrow hint says this should be LargeList instead of List + DataType::LargeList(Arc::new(Field::new("values", DataType::Int32, false))), + false, + ), + ])), + false, + ))), + true, + )]); + + // Convert with the modified schema hint + let converted_with_hint = + convert_schema(&schema, ProjectionMask::all(), Some(&modified_schema))?.unwrap(); + + // The conversion should respect the LargeList hint + assert_eq!( + converted_with_hint.arrow_type, + DataType::Struct(modified_schema), + "The inner repeated primitive should respect the LargeList arrow hint" + ); + + Ok(()) + } + #[test] fn convert_schema_with_nested_list_repeated_primitive() -> crate::errors::Result<()> { test_roundtrip( From 9fb3a938bcabc000de134a5942efcfd1fa242cee Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 4 Feb 2026 14:20:37 +0200 Subject: [PATCH 18/22] format --- parquet/src/arrow/schema/complex.rs | 46 +++++++++++++++-------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index da40b73dd788..d7970eef6bf0 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -88,9 +88,11 @@ impl ParquetField { Some(DataType::List(field_hint)) | Some(DataType::LargeList(field_hint)) | Some(DataType::FixedSizeList(field_hint, _)) => Some(field_hint.as_ref()), - Some(_) => return Err(general_err!( - "Internal error: should be validated earlier that list_data_type is only a type of list" - )), + Some(_) => { + return Err(general_err!( + "Internal error: should be validated earlier that list_data_type is only a type of list" + )); + } None => None, }; @@ -218,10 +220,12 @@ impl Visitor { Some(DataType::List(f)) => Some(f.as_ref()), Some(DataType::LargeList(f)) => Some(f.as_ref()), Some(DataType::FixedSizeList(f, _)) => Some(f.as_ref()), - Some(d) => return Err(arrow_err!( - "incompatible arrow schema, expected list got {} for repeated primitive field", - d - )), + Some(d) => { + return Err(arrow_err!( + "incompatible arrow schema, expected list got {} for repeated primitive field", + d + )); + } None => None, }; @@ -272,9 +276,9 @@ impl Visitor { Some(DataType::FixedSizeList(f, _)) => Some(f.as_ref()), Some(d) => { return Err(arrow_err!( - "incompatible arrow schema, expected list got {} for repeated struct field", - d - )) + "incompatible arrow schema, expected list got {} for repeated struct field", + d + )); } None => None, }; @@ -806,7 +810,7 @@ pub fn convert_type(parquet_type: &TypePtr) -> Result { #[cfg(test)] mod tests { use crate::arrow::schema::complex::convert_schema; - use crate::arrow::{ProjectionMask, PARQUET_FIELD_ID_META_KEY}; + use crate::arrow::{PARQUET_FIELD_ID_META_KEY, ProjectionMask}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; use arrow_schema::{DataType, Field, Fields}; @@ -1101,8 +1105,6 @@ mod tests { ) } - - /// Test that arrow hints are respected for nested repeated primitives inside backward-compatible /// LIST struct elements. /// @@ -1113,8 +1115,8 @@ mod tests { /// /// The conversion should respect the LargeList hint for the inner repeated primitive. #[test] - fn backward_compat_list_struct_with_nested_repeated_primitive_respects_arrow_hint( - ) -> crate::errors::Result<()> { + fn backward_compat_list_struct_with_nested_repeated_primitive_respects_arrow_hint() + -> crate::errors::Result<()> { // This is a backward-compatible LIST (rule 4) where the struct element contains // a repeated primitive. The arrow hint specifies that the inner repeated primitive // should be LargeList. @@ -1180,7 +1182,7 @@ mod tests { // Convert with the modified schema hint let converted_with_hint = - convert_schema(&schema, ProjectionMask::all(), Some(&modified_schema))?.unwrap(); + convert_schema(&schema, ProjectionMask::all(), Some(&modified_schema))?.unwrap(); // The conversion should respect the LargeList hint assert_eq!( @@ -1245,8 +1247,8 @@ mod tests { } #[test] - fn convert_schema_with_repeated_primitive_should_use_inferred_schema( - ) -> crate::errors::Result<()> { + fn convert_schema_with_repeated_primitive_should_use_inferred_schema() + -> crate::errors::Result<()> { let message_type = " message schema { repeated BYTE_ARRAY col_1 = 1; @@ -1310,8 +1312,8 @@ mod tests { } #[test] - fn convert_schema_with_repeated_primitive_should_use_inferred_schema_for_list_as_well( - ) -> crate::errors::Result<()> { + fn convert_schema_with_repeated_primitive_should_use_inferred_schema_for_list_as_well() + -> crate::errors::Result<()> { let message_type = " message schema { repeated BYTE_ARRAY col_1 = 1; @@ -1394,8 +1396,8 @@ mod tests { } #[test] - fn convert_schema_with_repeated_struct_and_inferred_schema_and_field_id( - ) -> crate::errors::Result<()> { + fn convert_schema_with_repeated_struct_and_inferred_schema_and_field_id() + -> crate::errors::Result<()> { let message_type = " message schema { repeated group my_col_1 = 1 { From ef7d611dd84fe7d059ea14f73b84f7e76e86fcc6 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:38:05 +0200 Subject: [PATCH 19/22] remove bad test --- parquet/src/arrow/schema/complex.rs | 89 ----------------------------- 1 file changed, 89 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 5d1afd4508c1..d496a5eb1a4c 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -1109,95 +1109,6 @@ mod tests { ) } - /// Test that arrow hints are respected for nested repeated primitives inside backward-compatible - /// LIST struct elements. - /// - /// This tests that when we have: - /// - A backward-compatible LIST (rule 4) containing a struct - /// - That struct contains a repeated primitive - /// - An arrow schema hint specifies the inner repeated primitive should be a LargeList - /// - /// The conversion should respect the LargeList hint for the inner repeated primitive. - #[test] - fn backward_compat_list_struct_with_nested_repeated_primitive_respects_arrow_hint() - -> crate::errors::Result<()> { - // This is a backward-compatible LIST (rule 4) where the struct element contains - // a repeated primitive. The arrow hint specifies that the inner repeated primitive - // should be LargeList. - let message_type = " - message schema { - optional group my_list (LIST) { - repeated group my_list_tuple { - required binary str (STRING); - repeated int32 values; - } - } - } - "; - - test_roundtrip(message_type)?; - - let parsed_input_schema = Arc::new(parse_message_type(message_type)?); - let schema = SchemaDescriptor::new(parsed_input_schema); - - // First, convert without hints to get the default schema - let converted = convert_schema(&schema, ProjectionMask::all(), None)?.unwrap(); - - let DataType::Struct(schema_fields) = &converted.arrow_type else { - panic!("Expected struct from convert_schema"); - }; - - let expected_default = Fields::from(vec![Field::new( - "my_list", - DataType::List(Arc::new(Field::new( - "my_list_tuple", - DataType::Struct(Fields::from(vec![ - Field::new("str", DataType::Utf8, false), - Field::new( - "values", - DataType::List(Arc::new(Field::new("values", DataType::Int32, false))), - false, - ), - ])), - false, - ))), - true, - )]); - - assert_eq!(schema_fields, &expected_default); - - let modified_schema = Fields::from(vec![Field::new( - "my_list", - DataType::List(Arc::new(Field::new( - "my_list_tuple", - DataType::Struct(Fields::from(vec![ - Field::new("str", DataType::Utf8, false), - Field::new( - "values", - // Arrow hint says this should be LargeList instead of List - DataType::LargeList(Arc::new(Field::new("values", DataType::Int32, false))), - false, - ), - ])), - false, - ))), - true, - )]); - - // Convert with the modified schema hint - let converted_with_hint = - convert_schema(&schema, ProjectionMask::all(), Some(&modified_schema))?.unwrap(); - - // The conversion should respect the LargeList hint - assert_eq!( - converted_with_hint.arrow_type, - DataType::Struct(modified_schema), - "The inner repeated primitive should respect the LargeList arrow hint" - ); - - Ok(()) - } - #[test] fn convert_schema_with_nested_list_repeated_primitive() -> crate::errors::Result<()> { test_roundtrip( From e322de0d6fae4421ce4c458977ccb0322b0ce52f Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 5 Mar 2026 18:51:26 +0200 Subject: [PATCH 20/22] update comment --- parquet/src/arrow/schema/complex.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index d496a5eb1a4c..e55c27b41862 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -334,8 +334,12 @@ impl Visitor { def_level, data_type, - // We set this to true to reset because children of a struct should - // independently use their arrow hints for repeated fields. + // Always true: each child is independently responsible for its own + // repeated-to-list conversion. The parent's flag may be false when + // this struct's own repetition is consumed by an outer visit_list + // backward-compat path, but that only applies to the struct itself, + // not its children. A repeated child's arrow hint will be List<...> + // and needs to be unwrapped accordingly. treat_repeated_as_list_arrow_hint: true, }; From b3bc31c584d7b869dad4bb7b5305e87138a0bf5e Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 5 Mar 2026 18:53:11 +0200 Subject: [PATCH 21/22] update comment --- parquet/src/arrow/schema/complex.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index e55c27b41862..83fe7c87e3ff 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -170,6 +170,8 @@ struct VisitorContext { /// and the list field data type would be treated as the hint for the parquet type /// /// when false, if data_type provided it will be treated as the hint without unwrapping + /// + /// This is for supporting deprecated parquet list representation treat_repeated_as_list_arrow_hint: bool, } From 6d9e68c2f281a7431df6e8cd3ddb92e3e403d854 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Thu, 5 Mar 2026 18:54:15 +0200 Subject: [PATCH 22/22] update comment --- parquet/src/arrow/schema/complex.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/arrow/schema/complex.rs b/parquet/src/arrow/schema/complex.rs index 83fe7c87e3ff..161b26302825 100644 --- a/parquet/src/arrow/schema/complex.rs +++ b/parquet/src/arrow/schema/complex.rs @@ -171,7 +171,9 @@ struct VisitorContext { /// /// when false, if data_type provided it will be treated as the hint without unwrapping /// - /// This is for supporting deprecated parquet list representation + /// This is for supporting [deprecated parquet list representation][1] + /// + /// [1]: https://github.com/apache/parquet-format/blob/38818fa0e7efd54b535001a4448030a40619c2a3/LogicalTypes.md?plain=1#L718-L806 treat_repeated_as_list_arrow_hint: bool, }