@@ -24,6 +24,8 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
2424use parquet_variant:: { VariantPath , VariantPathElement } ;
2525
2626use crate :: VariantArray ;
27+ use crate :: VariantArrayBuilder ;
28+ use crate :: VariantType ;
2729use crate :: variant_array:: BorrowedShreddingState ;
2830use crate :: variant_to_arrow:: make_variant_to_arrow_row_builder;
2931
@@ -86,15 +88,14 @@ pub(crate) fn follow_shredded_path_element<'a>(
8688 return Ok ( missing_path_step ( ) ) ;
8789 } ;
8890
89- let struct_array = field. as_struct_opt ( ) . ok_or_else ( || {
90- // TODO: Should we blow up? Or just end the traversal and let the normal
91- // variant pathing code sort out the mess that it must anyway be
92- // prepared to handle?
93- ArrowError :: InvalidArgumentError ( format ! (
94- "Expected Struct array while following path, got {}" ,
95- field. data_type( ) ,
96- ) )
97- } ) ?;
91+ // The field might be a VariantArray (StructArray) if shredded,
92+ // or it might be a primitive array. Only proceed if it's a StructArray.
93+ let Some ( struct_array) = field. as_struct_opt ( ) else {
94+ // Field exists but is not a StructArray, so it cannot be
95+ // followed further. Fall back to the value column if present,
96+ // otherwise the path is missing.
97+ return Ok ( missing_path_step ( ) ) ;
98+ } ;
9899
99100 let state = BorrowedShreddingState :: try_from ( struct_array) ?;
100101 Ok ( ShreddedPathStep :: Success ( state) )
@@ -223,26 +224,61 @@ fn shredded_get_path(
223224 // For shredded/partially-shredded targets (`typed_value` present), recurse into each field
224225 // separately to take advantage of deeper shredding in child fields.
225226 if let DataType :: Struct ( fields) = as_field. data_type ( ) {
226- if target. typed_value_field ( ) . is_none ( ) {
227+ let has_variant_fields = fields
228+ . iter ( )
229+ . any ( |f| f. try_extension_type :: < VariantType > ( ) . is_ok ( ) ) ;
230+ if target. typed_value_field ( ) . is_none ( ) && !has_variant_fields {
227231 return shred_basic_variant ( target, VariantPath :: default ( ) , Some ( as_field) ) ;
228232 }
229233
230- let children = fields
231- . iter ( )
232- . map ( |field| {
233- shredded_get_path (
234- & target,
235- & [ VariantPathElement :: from ( field. name ( ) . as_str ( ) ) ] ,
236- Some ( field) ,
237- cast_options,
238- )
239- } )
240- . collect :: < Result < Vec < _ > > > ( ) ?;
234+ let mut updated_fields = Vec :: with_capacity ( fields. len ( ) ) ;
235+ let mut children = Vec :: with_capacity ( fields. len ( ) ) ;
236+ for field in fields. iter ( ) {
237+ // If the field has VariantType extension metadata, extract it as a
238+ // VariantArray instead of casting to the declared data type. This allows
239+ // callers to request structs where some fields remain as variants.
240+ // See test_struct_extraction_with_variant_fields for usage example.
241+ let is_variant_field = field. try_extension_type :: < VariantType > ( ) . is_ok ( ) ;
242+ let field_as_type = ( !is_variant_field) . then ( || field. as_ref ( ) ) ;
243+ let child = shredded_get_path (
244+ & target,
245+ & [ VariantPathElement :: from ( field. name ( ) . as_str ( ) ) ] ,
246+ field_as_type,
247+ cast_options,
248+ ) ?;
249+
250+ // When the field is entirely absent in the data, shredded_get_path
251+ // returns a NullArray. For variant fields, construct an all-null
252+ // VariantArray so the extension metadata is preserved.
253+ let child = if is_variant_field && child. data_type ( ) == & DataType :: Null {
254+ let mut builder = VariantArrayBuilder :: new ( child. len ( ) ) ;
255+ for _ in 0 ..child. len ( ) {
256+ builder. append_null ( ) ;
257+ }
258+ let null_variant = builder. build ( ) ;
259+ Arc :: new ( null_variant. into_inner ( ) ) as ArrayRef
260+ } else {
261+ child
262+ } ;
263+
264+ // Update field data type to match the actual child array.
265+ // Preserve VariantType extension metadata for variant fields so
266+ // downstream consumers can recognize them as Variant columns.
267+ let mut new_field = field
268+ . as_ref ( )
269+ . clone ( )
270+ . with_data_type ( child. data_type ( ) . clone ( ) ) ;
271+ if is_variant_field {
272+ new_field = new_field. with_extension_type ( VariantType ) ;
273+ }
274+ updated_fields. push ( new_field) ;
275+ children. push ( child) ;
276+ }
241277
242278 let struct_nulls = target. nulls ( ) . cloned ( ) ;
243279
244280 return Ok ( Arc :: new ( StructArray :: try_new (
245- fields . clone ( ) ,
281+ updated_fields . into ( ) ,
246282 children,
247283 struct_nulls,
248284 ) ?) ) ;
@@ -263,9 +299,9 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti
263299 . value_field ( )
264300 . is_none_or ( |v| v. null_count ( ) == v. len ( ) )
265301 {
266- // Here we need to gate against the case where the `typed_value` is null but data is in the `value` column.
267- // 1. If the `value` column is null, or
268- // 2. If every row in the `value` column is null
302+ // When shredding is partial, some values may remain in the `value` column
303+ // (as raw variant binary) while `typed_value` is null. Only return the
304+ // typed value if the `value` column is entirely null (complete shredding).
269305
270306 // This is a perfect shredding, where the value is entirely shredded out,
271307 // so we can just return the typed value.
@@ -276,15 +312,30 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti
276312
277313/// Returns an array with the specified path extracted from the variant values.
278314///
279- /// The return array type depends on the `as_type` field of the options parameter
315+ /// The return array type depends on the `as_type` field of the options parameter:
280316/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point
281317/// to the specified path.
282318/// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned.
283319///
284- /// TODO: How would a caller request a struct or list type where the fields/elements can be any
285- /// variant? Caller can pass None as the requested type to fetch a specific path, but it would
286- /// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or
287- /// list and then try to assemble the results.
320+ /// When extracting a struct type (`DataType::Struct`), you can mix typed fields with variant fields
321+ /// by marking fields with the [`VariantType`] extension type. Fields with `VariantType` metadata
322+ /// will be extracted as VariantArrays, preserving the original variant representation.
323+ ///
324+ /// Example:
325+ /// ```rust,ignore
326+ /// use parquet_variant_compute::VariantType;
327+ /// use arrow_schema::extension::ExtensionType;
328+ ///
329+ /// // Extract a struct where "id" is converted to Int32, but "data" remains a Variant
330+ /// let fields = Fields::from(vec![
331+ /// Field::new("id", DataType::Int32, true),
332+ /// // Use VariantType extension metadata to request extraction as VariantArray
333+ /// Field::new("data", DataType::Struct(Fields::empty()), true)
334+ /// .with_extension_type(VariantType),
335+ /// ]);
336+ /// let options = GetOptions::new()
337+ /// .with_as_type(Some(Arc::new(Field::new("result", DataType::Struct(fields), true))));
338+ /// ```
288339pub fn variant_get ( input : & ArrayRef , options : GetOptions ) -> Result < ArrayRef > {
289340 let variant_array = VariantArray :: try_new ( input) ?;
290341
@@ -346,7 +397,8 @@ mod test {
346397 use super :: { GetOptions , variant_get} ;
347398 use crate :: variant_array:: { ShreddedVariantFieldArray , StructArrayBuilder } ;
348399 use crate :: {
349- VariantArray , VariantArrayBuilder , cast_to_variant, json_to_variant, shred_variant,
400+ VariantArray , VariantArrayBuilder , VariantType , cast_to_variant, json_to_variant,
401+ shred_variant,
350402 } ;
351403 use arrow:: array:: {
352404 Array , ArrayRef , AsArray , BinaryArray , BinaryViewArray , BooleanArray , Date32Array ,
@@ -4323,4 +4375,167 @@ mod test {
43234375 ) ;
43244376 }
43254377 }
4378+
4379+ /// Test extracting a struct with mixed typed and variant fields.
4380+ /// Fields with VariantType extension metadata should be extracted as VariantArrays.
4381+ #[ test]
4382+ fn test_struct_extraction_with_variant_fields ( ) {
4383+ // Create test data: three rows, each with "id", "name", and "data" fields.
4384+ // Row 1 has a null "data" value and row 2 has a null "name" value.
4385+ let json_strings = vec ! [
4386+ r#"{"id": 1, "name": "Alice", "data": {"score": 95}}"# ,
4387+ r#"{"id": 2, "name": "Bob", "data": null}"# ,
4388+ r#"{"id": 3, "name": null, "data": {"level": 5}}"# ,
4389+ ] ;
4390+ let string_array: Arc < dyn Array > = Arc :: new ( StringArray :: from ( json_strings) ) ;
4391+ let variant_array = json_to_variant ( & string_array) . unwrap ( ) ;
4392+
4393+ // Request struct where:
4394+ // - "id" is extracted as Int32
4395+ // - "name" is extracted as String (Utf8)
4396+ // - "data" is extracted as Variant (using VariantType extension metadata)
4397+ let struct_fields = Fields :: from ( vec ! [
4398+ Field :: new( "id" , DataType :: Int32 , true ) ,
4399+ Field :: new( "name" , DataType :: Utf8 , true ) ,
4400+ // Use VariantType extension metadata to request extraction as VariantArray.
4401+ // The data type must be Struct to satisfy VariantType::supports_data_type.
4402+ Field :: new( "data" , DataType :: Struct ( Fields :: empty( ) ) , true )
4403+ . with_extension_type( VariantType ) ,
4404+ ] ) ;
4405+ let struct_type = DataType :: Struct ( struct_fields) ;
4406+
4407+ let options = GetOptions {
4408+ path : VariantPath :: default ( ) ,
4409+ as_type : Some ( Arc :: new ( Field :: new ( "result" , struct_type, true ) ) ) ,
4410+ cast_options : CastOptions :: default ( ) ,
4411+ } ;
4412+
4413+ let variant_array_ref = ArrayRef :: from ( variant_array) ;
4414+ let result = variant_get ( & variant_array_ref, options) . unwrap ( ) ;
4415+
4416+ // Verify the result is a StructArray with 3 fields
4417+ let struct_result = result. as_any ( ) . downcast_ref :: < StructArray > ( ) . unwrap ( ) ;
4418+ assert_eq ! ( struct_result. len( ) , 3 ) ;
4419+ assert_eq ! ( struct_result. num_columns( ) , 3 ) ;
4420+
4421+ // Verify "id" field (Int32)
4422+ let id_field = struct_result
4423+ . column ( 0 )
4424+ . as_any ( )
4425+ . downcast_ref :: < Int32Array > ( )
4426+ . unwrap ( ) ;
4427+ assert_eq ! ( id_field. value( 0 ) , 1 ) ;
4428+ assert_eq ! ( id_field. value( 1 ) , 2 ) ;
4429+ assert_eq ! ( id_field. value( 2 ) , 3 ) ;
4430+
4431+ // Verify "name" field (String/Utf8)
4432+ let name_field = struct_result
4433+ . column ( 1 )
4434+ . as_any ( )
4435+ . downcast_ref :: < StringArray > ( )
4436+ . unwrap ( ) ;
4437+ assert_eq ! ( name_field. value( 0 ) , "Alice" ) ;
4438+ assert_eq ! ( name_field. value( 1 ) , "Bob" ) ;
4439+ assert ! ( name_field. is_null( 2 ) ) ; // null name in row 2
4440+
4441+ // Verify "data" field schema has VariantType extension metadata
4442+ let data_schema_field = struct_result
4443+ . fields ( )
4444+ . iter ( )
4445+ . find ( |f| f. name ( ) == "data" )
4446+ . unwrap ( ) ;
4447+ assert ! (
4448+ data_schema_field
4449+ . try_extension_type:: <VariantType >( )
4450+ . is_ok( ) ,
4451+ "data field should have VariantType extension metadata"
4452+ ) ;
4453+
4454+ // Verify "data" field (VariantArray)
4455+ let data_field = struct_result. column ( 2 ) ;
4456+ // The data field should be a StructArray representing VariantArray's internal structure
4457+ // It has columns: metadata, value (optional), typed_value (optional)
4458+ let data_as_struct = data_field. as_any ( ) . downcast_ref :: < StructArray > ( ) ;
4459+ assert ! (
4460+ data_as_struct. is_some( ) ,
4461+ "data field should be a VariantArray (represented as StructArray)"
4462+ ) ;
4463+
4464+ // Verify we can access the variant values
4465+ let data_variant_array = VariantArray :: try_new ( data_field) . unwrap ( ) ;
4466+ assert_eq ! ( data_variant_array. len( ) , 3 ) ;
4467+
4468+ // Row 0: data = {"score": 95}
4469+ let data0 = data_variant_array. value ( 0 ) ;
4470+ let obj0 = data0. as_object ( ) . expect ( "row 0 data should be an object" ) ;
4471+ let score = obj0. get ( "score" ) . expect ( "row 0 data should have 'score'" ) ;
4472+ assert_eq ! ( score. as_int16( ) , Some ( 95 ) ) ;
4473+
4474+ // Row 1: data = null
4475+ assert ! (
4476+ data_variant_array. is_null( 1 ) || matches!( data_variant_array. value( 1 ) , Variant :: Null )
4477+ ) ;
4478+
4479+ // Row 2: data = {"level": 5}
4480+ let data2 = data_variant_array. value ( 2 ) ;
4481+ let obj2 = data2. as_object ( ) . expect ( "row 2 data should be an object" ) ;
4482+ let level = obj2. get ( "level" ) . expect ( "row 2 data should have 'level'" ) ;
4483+ assert_eq ! ( level. as_int8( ) , Some ( 5 ) ) ;
4484+ }
4485+
4486+ /// Test that requesting a variant field absent in all rows does not panic.
4487+ /// Regression test: with_extension_type(VariantType) used to panic on NullArray.
4488+ #[ test]
4489+ fn test_struct_extraction_missing_variant_field_no_panic ( ) {
4490+ // Data has "id" but NOT "missing_field"
4491+ let json_strings = vec ! [ r#"{"id": 1}"# , r#"{"id": 2}"# ] ;
4492+ let string_array: Arc < dyn Array > = Arc :: new ( StringArray :: from ( json_strings) ) ;
4493+ let variant_array = json_to_variant ( & string_array) . unwrap ( ) ;
4494+
4495+ // Request struct with a variant field that doesn't exist in any row
4496+ let struct_fields = Fields :: from ( vec ! [
4497+ Field :: new( "id" , DataType :: Int32 , true ) ,
4498+ Field :: new( "missing_field" , DataType :: Struct ( Fields :: empty( ) ) , true )
4499+ . with_extension_type( VariantType ) ,
4500+ ] ) ;
4501+ let struct_type = DataType :: Struct ( struct_fields) ;
4502+
4503+ let options = GetOptions {
4504+ path : VariantPath :: default ( ) ,
4505+ as_type : Some ( Arc :: new ( Field :: new ( "result" , struct_type, true ) ) ) ,
4506+ cast_options : CastOptions :: default ( ) ,
4507+ } ;
4508+
4509+ let variant_array_ref = ArrayRef :: from ( variant_array) ;
4510+ // This should not panic
4511+ let result = variant_get ( & variant_array_ref, options) . unwrap ( ) ;
4512+
4513+ let struct_result = result. as_any ( ) . downcast_ref :: < StructArray > ( ) . unwrap ( ) ;
4514+ assert_eq ! ( struct_result. len( ) , 2 ) ;
4515+ assert_eq ! ( struct_result. num_columns( ) , 2 ) ;
4516+
4517+ // The missing variant field should be all nulls
4518+ let missing_col = struct_result. column ( 1 ) ;
4519+ assert_eq ! ( missing_col. null_count( ) , missing_col. len( ) ) ;
4520+
4521+ // The missing variant field should preserve VariantType extension metadata
4522+ let missing_schema_field = struct_result
4523+ . fields ( )
4524+ . iter ( )
4525+ . find ( |f| f. name ( ) == "missing_field" )
4526+ . unwrap ( ) ;
4527+ assert ! (
4528+ missing_schema_field
4529+ . try_extension_type:: <VariantType >( )
4530+ . is_ok( ) ,
4531+ "missing variant field should preserve VariantType extension metadata"
4532+ ) ;
4533+
4534+ // The missing variant field should be a valid VariantArray
4535+ let missing_variant = VariantArray :: try_new ( missing_col) ;
4536+ assert ! (
4537+ missing_variant. is_ok( ) ,
4538+ "missing variant field should be a valid VariantArray"
4539+ ) ;
4540+ }
43264541}
0 commit comments