Skip to content

Commit 3ecc20d

Browse files
committed
Preserve VariantType for missing variant fields
Import VariantArrayBuilder and change shredded path handling to gracefully handle non-Struct fields by falling back to the value column instead of erroring.
1 parent 368d86c commit 3ecc20d

File tree

1 file changed

+72
-53
lines changed

1 file changed

+72
-53
lines changed

parquet-variant-compute/src/variant_get.rs

Lines changed: 72 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
2424
use parquet_variant::{VariantPath, VariantPathElement};
2525

2626
use crate::VariantArray;
27+
use crate::VariantArrayBuilder;
2728
use crate::VariantType;
2829
use crate::variant_array::BorrowedShreddingState;
2930
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
@@ -90,17 +91,10 @@ pub(crate) fn follow_shredded_path_element<'a>(
9091
// The field might be a VariantArray (StructArray) if shredded,
9192
// or it might be a primitive array. Only proceed if it's a StructArray.
9293
let Some(struct_array) = field.as_struct_opt() else {
93-
// Field exists but is not a StructArray (VariantArray),
94-
// which means it's not shredded further.
95-
if !cast_options.safe {
96-
return Err(ArrowError::CastError(format!(
97-
"Expected Struct array while following path, got {}",
98-
field.data_type(),
99-
)));
100-
}
101-
// In safe mode, return NotShredded to let the caller
102-
// handle it via value column.
103-
return Ok(ShreddedPathStep::NotShredded);
94+
// Field exists but is not a StructArray, so it cannot be
95+
// followed further. Fall back to the value column if present,
96+
// otherwise the path is missing.
97+
return Ok(missing_path_step());
10498
};
10599

106100
let state = BorrowedShreddingState::try_from(struct_array)?;
@@ -224,47 +218,48 @@ fn shredded_get_path(
224218
// further, and build up the final struct from those individually shredded results.
225219
if let DataType::Struct(fields) = as_field.data_type() {
226220
let mut updated_fields = Vec::with_capacity(fields.len());
227-
let children: Result<Vec<_>> = fields
228-
.iter()
229-
.map(|field| {
230-
// If the field has VariantType extension metadata, extract it as a
231-
// VariantArray instead of casting to the declared data type. This allows
232-
// callers to request structs where some fields remain as variants.
233-
// See test_struct_extraction_with_variant_fields for usage example.
234-
let is_strongly_typed = field.try_extension_type::<VariantType>().is_err();
235-
let field_as_type = is_strongly_typed.then(|| field.as_ref());
236-
let child = shredded_get_path(
237-
&target,
238-
&[VariantPathElement::from(field.name().as_str())],
239-
field_as_type,
240-
cast_options,
241-
)?;
242-
243-
// Update field type if it was a Variant marker (extracted as VariantArray).
244-
// The actual data type will be the internal structure of VariantArray.
245-
// Preserve VariantType extension metadata so downstream consumers
246-
// can recognize this field as a Variant column.
247-
//
248-
// When the field is entirely absent in the data, shredded_get_path
249-
// returns a NullArray (DataType::Null). VariantType only supports
250-
// Struct storage, so we must skip the extension in that case.
251-
let is_variant_field = !is_strongly_typed;
252-
let new_field = field
253-
.as_ref()
254-
.clone()
255-
.with_data_type(child.data_type().clone());
256-
let updated_field =
257-
if is_variant_field && matches!(child.data_type(), DataType::Struct(_)) {
258-
new_field.with_extension_type(VariantType)
259-
} else {
260-
new_field
261-
};
262-
updated_fields.push(updated_field);
221+
let mut children = Vec::with_capacity(fields.len());
222+
for field in fields.iter() {
223+
// If the field has VariantType extension metadata, extract it as a
224+
// VariantArray instead of casting to the declared data type. This allows
225+
// callers to request structs where some fields remain as variants.
226+
// See test_struct_extraction_with_variant_fields for usage example.
227+
let is_variant_field = field.try_extension_type::<VariantType>().is_ok();
228+
let field_as_type = (!is_variant_field).then(|| field.as_ref());
229+
let child = shredded_get_path(
230+
&target,
231+
&[VariantPathElement::from(field.name().as_str())],
232+
field_as_type,
233+
cast_options,
234+
)?;
235+
236+
// When the field is entirely absent in the data, shredded_get_path
237+
// returns a NullArray. For variant fields, construct an all-null
238+
// VariantArray so the extension metadata is preserved.
239+
let child = if is_variant_field && child.data_type() == &DataType::Null {
240+
let mut builder = VariantArrayBuilder::new(child.len());
241+
for _ in 0..child.len() {
242+
builder.append_null();
243+
}
244+
let null_variant = builder.build();
245+
Arc::new(null_variant.into_inner()) as ArrayRef
246+
} else {
247+
child
248+
};
263249

264-
Ok(child)
265-
})
266-
.collect();
267-
let children = children?;
250+
// Update field data type to match the actual child array.
251+
// Preserve VariantType extension metadata for variant fields so
252+
// downstream consumers can recognize them as Variant columns.
253+
let mut new_field = field
254+
.as_ref()
255+
.clone()
256+
.with_data_type(child.data_type().clone());
257+
if is_variant_field {
258+
new_field = new_field.with_extension_type(VariantType);
259+
}
260+
updated_fields.push(new_field);
261+
children.push(child);
262+
}
268263

269264
let struct_nulls = target.nulls().cloned();
270265

@@ -4383,7 +4378,9 @@ mod test {
43834378

43844379
// Row 0: data = {"score": 95}
43854380
let data0 = data_variant_array.value(0);
4386-
assert!(matches!(data0, Variant::Object(_)));
4381+
let obj0 = data0.as_object().expect("row 0 data should be an object");
4382+
let score = obj0.get("score").expect("row 0 data should have 'score'");
4383+
assert_eq!(score.as_int16(), Some(95));
43874384

43884385
// Row 1: data = null
43894386
assert!(
@@ -4392,7 +4389,9 @@ mod test {
43924389

43934390
// Row 2: data = {"level": 5}
43944391
let data2 = data_variant_array.value(2);
4395-
assert!(matches!(data2, Variant::Object(_)));
4392+
let obj2 = data2.as_object().expect("row 2 data should be an object");
4393+
let level = obj2.get("level").expect("row 2 data should have 'level'");
4394+
assert_eq!(level.as_int8(), Some(5));
43964395
}
43974396

43984397
/// Test that requesting a variant field absent in all rows does not panic.
@@ -4429,5 +4428,25 @@ mod test {
44294428
// The missing variant field should be all nulls
44304429
let missing_col = struct_result.column(1);
44314430
assert_eq!(missing_col.null_count(), missing_col.len());
4431+
4432+
// The missing variant field should preserve VariantType extension metadata
4433+
let missing_schema_field = struct_result
4434+
.fields()
4435+
.iter()
4436+
.find(|f| f.name() == "missing_field")
4437+
.unwrap();
4438+
assert!(
4439+
missing_schema_field
4440+
.try_extension_type::<VariantType>()
4441+
.is_ok(),
4442+
"missing variant field should preserve VariantType extension metadata"
4443+
);
4444+
4445+
// The missing variant field should be a valid VariantArray
4446+
let missing_variant = VariantArray::try_new(missing_col);
4447+
assert!(
4448+
missing_variant.is_ok(),
4449+
"missing variant field should be a valid VariantArray"
4450+
);
44324451
}
44334452
}

0 commit comments

Comments
 (0)