diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index d80d2f9863f6..d2021e12f6e5 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -303,6 +303,7 @@ impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> { data_type, cast_options, capacity, + true, )?, nulls: NullBufferBuilder::new(capacity), null_value, diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 73906f70eb77..896900cc5d38 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -365,8 +365,8 @@ mod test { use arrow_schema::{DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit}; use chrono::DateTime; use parquet_variant::{ - EMPTY_VARIANT_METADATA_BYTES, Variant, VariantBuilder, VariantDecimal4, VariantDecimal8, - VariantDecimal16, VariantDecimalType, VariantPath, + EMPTY_VARIANT_METADATA_BYTES, Variant, VariantDecimal4, VariantDecimal8, VariantDecimal16, + VariantDecimalType, VariantPath, }; fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { @@ -4131,69 +4131,45 @@ mod test { ])); let variant_array = ArrayRef::from(json_to_variant(&string_array).unwrap()); - let value_array: ArrayRef = { - let mut builder = VariantBuilder::new(); - builder.append_value("two"); - let (_, value_bytes) = builder.finish(); - Arc::new(BinaryViewArray::from(vec![ - None, - Some(value_bytes.as_slice()), - None, - ])) - }; - let typed_value_array: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])); - let struct_fields = Fields::from(vec![ - Field::new("value", DataType::BinaryView, true), - Field::new("typed_value", DataType::Int64, true), - ]); - let struct_array: ArrayRef = Arc::new( - StructArray::try_new( - struct_fields.clone(), - vec![value_array.clone(), typed_value_array.clone()], - None, - ) - .unwrap(), - ); - - let request_field = Arc::new(Field::new("item", DataType::Int64, true)); - let result_field = Arc::new(Field::new("item", DataType::Struct(struct_fields), true)); + let element_array: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(3)])); + let field = Arc::new(Field::new("item", Int64, true)); let expectations = vec![ ( - DataType::List(request_field.clone()), + DataType::List(field.clone()), Arc::new(ListArray::new( - result_field.clone(), + field.clone(), OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 3])), - struct_array.clone(), + element_array.clone(), Some(NullBuffer::from(vec![true, false])), )) as ArrayRef, ), ( - DataType::LargeList(request_field.clone()), + DataType::LargeList(field.clone()), Arc::new(LargeListArray::new( - result_field.clone(), + field.clone(), OffsetBuffer::new(ScalarBuffer::from(vec![0, 3, 3])), - struct_array.clone(), + element_array.clone(), Some(NullBuffer::from(vec![true, false])), )) as ArrayRef, ), ( - DataType::ListView(request_field.clone()), + DataType::ListView(field.clone()), Arc::new(ListViewArray::new( - result_field.clone(), + field.clone(), ScalarBuffer::from(vec![0, 3]), ScalarBuffer::from(vec![3, 0]), - struct_array.clone(), + element_array.clone(), Some(NullBuffer::from(vec![true, false])), )) as ArrayRef, ), ( - DataType::LargeListView(request_field), + DataType::LargeListView(field.clone()), Arc::new(LargeListViewArray::new( - result_field, + field, ScalarBuffer::from(vec![0, 3]), ScalarBuffer::from(vec![3, 0]), - struct_array, + element_array, Some(NullBuffer::from(vec![true, false])), )) as ArrayRef, ), @@ -4235,6 +4211,52 @@ mod test { } } + #[test] + fn test_variant_get_nested_list() { + use arrow::datatypes::Int64Type; + + let string_array: ArrayRef = Arc::new(StringArray::from(vec![ + r#"[[1, 2], [3]]"#, + r#"[[4], "not a list", [5, 6]]"#, + ])); + let variant_array = ArrayRef::from(json_to_variant(&string_array).unwrap()); + + let inner_field = Arc::new(Field::new("item", Int64, true)); + let outer_field = Arc::new(Field::new( + "item", + DataType::List(inner_field.clone()), + true, + )); + let request_type = DataType::List(outer_field.clone()); + + let options = GetOptions::new().with_as_type(Some(FieldRef::from(Field::new( + "result", + request_type, + true, + )))); + let result = variant_get(&variant_array, options).unwrap(); + let outer = result.as_list::(); + + // Row 0: [[1, 2], [3]] + let row0 = outer.value(0); + let row0 = row0.as_list::(); + assert_eq!(row0.len(), 2); + let elem0 = row0.value(0); + assert_eq!(elem0.as_primitive::().values(), &[1, 2]); + let elem1 = row0.value(1); + assert_eq!(elem1.as_primitive::().values(), &[3]); + + // Row 1: [[4], null, [5, 6]] — "not a list" becomes null inner list + let row1 = outer.value(1); + let row1 = row1.as_list::(); + assert_eq!(row1.len(), 3); + let elem0 = row1.value(0); + assert_eq!(elem0.as_primitive::().values(), &[4]); + assert!(row1.is_null(1)); + let elem2 = row1.value(2); + assert_eq!(elem2.as_primitive::().values(), &[5, 6]); + } + #[test] fn test_variant_get_list_like_unsafe_cast_errors_on_element_mismatch() { let string_array: ArrayRef = @@ -4287,40 +4309,17 @@ mod test { .with_cast_options(cast_options); let result = variant_get(&variant_array, options).unwrap(); - let element_struct = result - .as_any() - .downcast_ref::() - .unwrap() + let list_array = result.as_any().downcast_ref::().unwrap(); + let values = list_array .values() .as_any() - .downcast_ref::() - .unwrap(); - - let value = element_struct - .column_by_name("value") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - let typed_value = element_struct - .column_by_name("typed_value") - .unwrap() - .as_any() .downcast_ref::() .unwrap(); - assert_eq!(typed_value.len(), 3); - assert_eq!(typed_value.value(0), 1); - assert!(typed_value.is_null(1)); - assert_eq!(typed_value.value(2), 3); - - assert!(value.is_null(0)); - assert!(value.is_valid(1)); - assert_eq!( - Variant::new(EMPTY_VARIANT_METADATA_BYTES, value.value(1)), - Variant::Null - ); - assert!(value.is_null(2)); + assert_eq!(values.len(), 3); + assert_eq!(values.value(0), 1); + assert!(values.is_null(1)); + assert_eq!(values.value(2), 3); } #[test] diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index dd396117d22d..dd054a5f7d82 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -107,7 +107,7 @@ fn make_typed_variant_to_arrow_row_builder<'a>( | DataType::LargeListView(_) | DataType::FixedSizeList(..)) => { let builder = - ArrayVariantToArrowRowBuilder::try_new(data_type, cast_options, capacity)?; + ArrayVariantToArrowRowBuilder::try_new(data_type, cast_options, capacity, false)?; Ok(Array(builder)) } data_type => { @@ -587,10 +587,17 @@ impl<'a> StructVariantToArrowRowBuilder<'a> { } impl<'a> ArrayVariantToArrowRowBuilder<'a> { + /// Creates a new list builder for the given data type. + /// + /// # Arguments + /// * `shredded` - If true, element builders produce shredded structs with `value`/`typed_value` + /// fields (for [`crate::shred_variant()`]). If false, element builders produce strongly typed + /// arrays directly (for [`crate::variant_get()`]). pub(crate) fn try_new( data_type: &'a DataType, cast_options: &'a CastOptions, capacity: usize, + shredded: bool, ) -> Result { use ArrayVariantToArrowRowBuilder::*; @@ -602,6 +609,7 @@ impl<'a> ArrayVariantToArrowRowBuilder<'a> { $field.data_type(), cast_options, capacity, + shredded, )?) }; } @@ -893,13 +901,45 @@ impl<'a> VariantToUuidArrowRowBuilder<'a> { } } +/// Element builder for list variants, supporting both typed (for [`crate::variant_get()`]) +/// and shredded (for [`crate::shred_variant()`]) output modes. +enum ListElementBuilder<'a> { + /// Produces the target array type directly. + Typed(Box>), + /// Produces a shredded struct with `value` and `typed_value` fields. + Shredded(Box>), +} + +impl<'a> ListElementBuilder<'a> { + fn append_value(&mut self, value: Variant<'_, '_>) -> Result { + match self { + Self::Typed(b) => b.append_value(value), + Self::Shredded(b) => b.append_value(value), + } + } + + fn finish(self) -> Result { + match self { + Self::Typed(b) => b.finish(), + Self::Shredded(b) => { + let (value, typed_value, nulls) = b.finish()?; + Ok(ArrayRef::from(ShreddedVariantFieldArray::from_parts( + Some(value), + Some(typed_value), + nulls, + ))) + } + } + } +} + pub(crate) struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool> where O: OffsetSizeTrait + ArrowNativeTypeOp, { field: FieldRef, offsets: Vec, - element_builder: Box>, + element_builder: ListElementBuilder<'a>, nulls: NullBufferBuilder, current_offset: O, cast_options: &'a CastOptions<'a>, @@ -914,6 +954,7 @@ where element_data_type: &'a DataType, cast_options: &'a CastOptions, capacity: usize, + shredded: bool, ) -> Result { if capacity >= isize::MAX as usize { return Err(ArrowError::ComputeError( @@ -922,16 +963,24 @@ where } let mut offsets = Vec::with_capacity(capacity + 1); offsets.push(O::ZERO); - let element_builder = make_variant_to_shredded_variant_arrow_row_builder( - element_data_type, - cast_options, - capacity, - NullValue::ArrayElement, - )?; + let element_builder = if shredded { + let builder = make_variant_to_shredded_variant_arrow_row_builder( + element_data_type, + cast_options, + capacity, + NullValue::ArrayElement, + )?; + ListElementBuilder::Shredded(Box::new(builder)) + } else { + let builder = + make_typed_variant_to_arrow_row_builder(element_data_type, cast_options, capacity)?; + ListElementBuilder::Typed(Box::new(builder)) + }; + Ok(Self { field, offsets, - element_builder: Box::new(element_builder), + element_builder, nulls: NullBufferBuilder::new(capacity), current_offset: O::ZERO, cast_options, @@ -966,9 +1015,7 @@ where } fn finish(mut self) -> Result { - let (value, typed_value, nulls) = self.element_builder.finish()?; - let element_array = - ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); + let element_array: ArrayRef = self.element_builder.finish()?; let field = Arc::new( self.field .as_ref() @@ -987,7 +1034,7 @@ where field, ScalarBuffer::from(self.offsets), ScalarBuffer::from(sizes), - ArrayRef::from(element_array), + element_array, self.nulls.finish(), ); Ok(Arc::new(list_view_array)) @@ -995,7 +1042,7 @@ where let list_array = GenericListArray::::new( field, OffsetBuffer::::new(ScalarBuffer::from(self.offsets)), - ArrayRef::from(element_array), + element_array, self.nulls.finish(), ); Ok(Arc::new(list_array))