Skip to content

Commit 97e687e

Browse files
codephage2020 and claude committed
Support extracting struct fields as Variant using ExtensionType
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent aa9432c commit 97e687e

File tree

1 file changed

+246
-31
lines changed

1 file changed

+246
-31
lines changed

parquet-variant-compute/src/variant_get.rs

Lines changed: 246 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,8 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
2424
use parquet_variant::{VariantPath, VariantPathElement};
2525

2626
use crate::VariantArray;
27+
use crate::VariantArrayBuilder;
28+
use crate::VariantType;
2729
use crate::variant_array::BorrowedShreddingState;
2830
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
2931

@@ -86,15 +88,14 @@ pub(crate) fn follow_shredded_path_element<'a>(
8688
return Ok(missing_path_step());
8789
};
8890

89-
let struct_array = field.as_struct_opt().ok_or_else(|| {
90-
// TODO: Should we blow up? Or just end the traversal and let the normal
91-
// variant pathing code sort out the mess that it must anyway be
92-
// prepared to handle?
93-
ArrowError::InvalidArgumentError(format!(
94-
"Expected Struct array while following path, got {}",
95-
field.data_type(),
96-
))
97-
})?;
91+
// The field might be a VariantArray (StructArray) if shredded,
92+
// or it might be a primitive array. Only proceed if it's a StructArray.
93+
let Some(struct_array) = field.as_struct_opt() else {
94+
// Field exists but is not a StructArray, so it cannot be
95+
// followed further. Fall back to the value column if present,
96+
// otherwise the path is missing.
97+
return Ok(missing_path_step());
98+
};
9899

99100
let state = BorrowedShreddingState::try_from(struct_array)?;
100101
Ok(ShreddedPathStep::Success(state))
@@ -223,26 +224,61 @@ fn shredded_get_path(
223224
// For shredded/partially-shredded targets (`typed_value` present), recurse into each field
224225
// separately to take advantage of deeper shredding in child fields.
225226
if let DataType::Struct(fields) = as_field.data_type() {
226-
if target.typed_value_field().is_none() {
227+
let has_variant_fields = fields
228+
.iter()
229+
.any(|f| f.try_extension_type::<VariantType>().is_ok());
230+
if target.typed_value_field().is_none() && !has_variant_fields {
227231
return shred_basic_variant(target, VariantPath::default(), Some(as_field));
228232
}
229233

230-
let children = fields
231-
.iter()
232-
.map(|field| {
233-
shredded_get_path(
234-
&target,
235-
&[VariantPathElement::from(field.name().as_str())],
236-
Some(field),
237-
cast_options,
238-
)
239-
})
240-
.collect::<Result<Vec<_>>>()?;
234+
let mut updated_fields = Vec::with_capacity(fields.len());
235+
let mut children = Vec::with_capacity(fields.len());
236+
for field in fields.iter() {
237+
// If the field has VariantType extension metadata, extract it as a
238+
// VariantArray instead of casting to the declared data type. This allows
239+
// callers to request structs where some fields remain as variants.
240+
// See test_struct_extraction_with_variant_fields for usage example.
241+
let is_variant_field = field.try_extension_type::<VariantType>().is_ok();
242+
let field_as_type = (!is_variant_field).then(|| field.as_ref());
243+
let child = shredded_get_path(
244+
&target,
245+
&[VariantPathElement::from(field.name().as_str())],
246+
field_as_type,
247+
cast_options,
248+
)?;
249+
250+
// When the field is entirely absent in the data, shredded_get_path
251+
// returns a NullArray. For variant fields, construct an all-null
252+
// VariantArray so the extension metadata is preserved.
253+
let child = if is_variant_field && child.data_type() == &DataType::Null {
254+
let mut builder = VariantArrayBuilder::new(child.len());
255+
for _ in 0..child.len() {
256+
builder.append_null();
257+
}
258+
let null_variant = builder.build();
259+
Arc::new(null_variant.into_inner()) as ArrayRef
260+
} else {
261+
child
262+
};
263+
264+
// Update field data type to match the actual child array.
265+
// Preserve VariantType extension metadata for variant fields so
266+
// downstream consumers can recognize them as Variant columns.
267+
let mut new_field = field
268+
.as_ref()
269+
.clone()
270+
.with_data_type(child.data_type().clone());
271+
if is_variant_field {
272+
new_field = new_field.with_extension_type(VariantType);
273+
}
274+
updated_fields.push(new_field);
275+
children.push(child);
276+
}
241277

242278
let struct_nulls = target.nulls().cloned();
243279

244280
return Ok(Arc::new(StructArray::try_new(
245-
fields.clone(),
281+
updated_fields.into(),
246282
children,
247283
struct_nulls,
248284
)?));
@@ -263,9 +299,9 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti
263299
.value_field()
264300
.is_none_or(|v| v.null_count() == v.len())
265301
{
266-
// Here we need to gate against the case where the `typed_value` is null but data is in the `value` column.
267-
// 1. If the `value` column is null, or
268-
// 2. If every row in the `value` column is null
302+
// When shredding is partial, some values may remain in the `value` column
303+
// (as raw variant binary) while `typed_value` is null. Only return the
304+
// typed value if the `value` column is absent or entirely null (complete shredding).
269305

270306
// This is a perfect shredding, where the value is entirely shredded out,
271307
// so we can just return the typed value.
@@ -276,15 +312,30 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti
276312

277313
/// Returns an array with the specified path extracted from the variant values.
278314
///
279-
/// The return array type depends on the `as_type` field of the options parameter
315+
/// The return array type depends on the `as_type` field of the options parameter:
280316
/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point
281317
/// to the specified path.
282318
/// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned.
283319
///
284-
/// TODO: How would a caller request a struct or list type where the fields/elements can be any
285-
/// variant? Caller can pass None as the requested type to fetch a specific path, but it would
286-
/// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or
287-
/// list and then try to assemble the results.
320+
/// When extracting a struct type (`DataType::Struct`), you can mix typed fields with variant fields
321+
/// by marking fields with the [`VariantType`] extension type. Fields with `VariantType` metadata
322+
/// will be extracted as VariantArrays, preserving the original variant representation.
323+
///
324+
/// Example:
325+
/// ```rust,ignore
326+
/// use parquet_variant_compute::VariantType;
327+
/// use arrow_schema::extension::ExtensionType;
328+
///
329+
/// // Extract a struct where "name" is converted to Int32, but "data" remains a Variant
330+
/// let fields = Fields::from(vec![
331+
/// Field::new("name", DataType::Int32, true),
332+
/// // Use VariantType extension metadata to request extraction as VariantArray
333+
/// Field::new("data", DataType::Struct(Fields::empty()), true)
334+
/// .with_extension_type(VariantType),
335+
/// ]);
336+
/// let options = GetOptions::new()
337+
/// .with_as_type(Some(Arc::new(Field::new("result", DataType::Struct(fields), true))));
338+
/// ```
288339
pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
289340
let variant_array = VariantArray::try_new(input)?;
290341

@@ -346,7 +397,8 @@ mod test {
346397
use super::{GetOptions, variant_get};
347398
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
348399
use crate::{
349-
VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant, shred_variant,
400+
VariantArray, VariantArrayBuilder, VariantType, cast_to_variant, json_to_variant,
401+
shred_variant,
350402
};
351403
use arrow::array::{
352404
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
@@ -4323,4 +4375,167 @@ mod test {
43234375
);
43244376
}
43254377
}
4378+
4379+
/// Test extracting a struct with mixed typed and variant fields.
4380+
/// Fields with VariantType extension metadata should be extracted as VariantArrays.
4381+
#[test]
4382+
fn test_struct_extraction_with_variant_fields() {
4383+
// Create test data: [{"id": 1, "name": "Alice", "data": {"score": 95}},
4384+
// {"id": 2, "name": "Bob", "data": null}, {"id": 3, "name": null, "data": {"level": 5}}]
4385+
let json_strings = vec![
4386+
r#"{"id": 1, "name": "Alice", "data": {"score": 95}}"#,
4387+
r#"{"id": 2, "name": "Bob", "data": null}"#,
4388+
r#"{"id": 3, "name": null, "data": {"level": 5}}"#,
4389+
];
4390+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(json_strings));
4391+
let variant_array = json_to_variant(&string_array).unwrap();
4392+
4393+
// Request struct where:
4394+
// - "id" is extracted as Int32
4395+
// - "name" is extracted as String (Utf8)
4396+
// - "data" is extracted as Variant (using VariantType extension metadata)
4397+
let struct_fields = Fields::from(vec![
4398+
Field::new("id", DataType::Int32, true),
4399+
Field::new("name", DataType::Utf8, true),
4400+
// Use VariantType extension metadata to request extraction as VariantArray.
4401+
// The data type must be Struct to satisfy VariantType::supports_data_type.
4402+
Field::new("data", DataType::Struct(Fields::empty()), true)
4403+
.with_extension_type(VariantType),
4404+
]);
4405+
let struct_type = DataType::Struct(struct_fields);
4406+
4407+
let options = GetOptions {
4408+
path: VariantPath::default(),
4409+
as_type: Some(Arc::new(Field::new("result", struct_type, true))),
4410+
cast_options: CastOptions::default(),
4411+
};
4412+
4413+
let variant_array_ref = ArrayRef::from(variant_array);
4414+
let result = variant_get(&variant_array_ref, options).unwrap();
4415+
4416+
// Verify the result is a StructArray with 3 fields
4417+
let struct_result = result.as_any().downcast_ref::<StructArray>().unwrap();
4418+
assert_eq!(struct_result.len(), 3);
4419+
assert_eq!(struct_result.num_columns(), 3);
4420+
4421+
// Verify "id" field (Int32)
4422+
let id_field = struct_result
4423+
.column(0)
4424+
.as_any()
4425+
.downcast_ref::<Int32Array>()
4426+
.unwrap();
4427+
assert_eq!(id_field.value(0), 1);
4428+
assert_eq!(id_field.value(1), 2);
4429+
assert_eq!(id_field.value(2), 3);
4430+
4431+
// Verify "name" field (String/Utf8)
4432+
let name_field = struct_result
4433+
.column(1)
4434+
.as_any()
4435+
.downcast_ref::<StringArray>()
4436+
.unwrap();
4437+
assert_eq!(name_field.value(0), "Alice");
4438+
assert_eq!(name_field.value(1), "Bob");
4439+
assert!(name_field.is_null(2)); // null name in row 2
4440+
4441+
// Verify "data" field schema has VariantType extension metadata
4442+
let data_schema_field = struct_result
4443+
.fields()
4444+
.iter()
4445+
.find(|f| f.name() == "data")
4446+
.unwrap();
4447+
assert!(
4448+
data_schema_field
4449+
.try_extension_type::<VariantType>()
4450+
.is_ok(),
4451+
"data field should have VariantType extension metadata"
4452+
);
4453+
4454+
// Verify "data" field (VariantArray)
4455+
let data_field = struct_result.column(2);
4456+
// The data field should be a StructArray representing VariantArray's internal structure
4457+
// It has columns: metadata, value (optional), typed_value (optional)
4458+
let data_as_struct = data_field.as_any().downcast_ref::<StructArray>();
4459+
assert!(
4460+
data_as_struct.is_some(),
4461+
"data field should be a VariantArray (represented as StructArray)"
4462+
);
4463+
4464+
// Verify we can access the variant values
4465+
let data_variant_array = VariantArray::try_new(data_field).unwrap();
4466+
assert_eq!(data_variant_array.len(), 3);
4467+
4468+
// Row 0: data = {"score": 95}
4469+
let data0 = data_variant_array.value(0);
4470+
let obj0 = data0.as_object().expect("row 0 data should be an object");
4471+
let score = obj0.get("score").expect("row 0 data should have 'score'");
4472+
assert_eq!(score.as_int16(), Some(95));
4473+
4474+
// Row 1: data = null
4475+
assert!(
4476+
data_variant_array.is_null(1) || matches!(data_variant_array.value(1), Variant::Null)
4477+
);
4478+
4479+
// Row 2: data = {"level": 5}
4480+
let data2 = data_variant_array.value(2);
4481+
let obj2 = data2.as_object().expect("row 2 data should be an object");
4482+
let level = obj2.get("level").expect("row 2 data should have 'level'");
4483+
assert_eq!(level.as_int8(), Some(5));
4484+
}
4485+
4486+
/// Test that requesting a variant field absent in all rows does not panic.
4487+
/// Regression test: with_extension_type(VariantType) used to panic on NullArray.
4488+
#[test]
4489+
fn test_struct_extraction_missing_variant_field_no_panic() {
4490+
// Data has "id" but NOT "missing_field"
4491+
let json_strings = vec![r#"{"id": 1}"#, r#"{"id": 2}"#];
4492+
let string_array: Arc<dyn Array> = Arc::new(StringArray::from(json_strings));
4493+
let variant_array = json_to_variant(&string_array).unwrap();
4494+
4495+
// Request struct with a variant field that doesn't exist in any row
4496+
let struct_fields = Fields::from(vec![
4497+
Field::new("id", DataType::Int32, true),
4498+
Field::new("missing_field", DataType::Struct(Fields::empty()), true)
4499+
.with_extension_type(VariantType),
4500+
]);
4501+
let struct_type = DataType::Struct(struct_fields);
4502+
4503+
let options = GetOptions {
4504+
path: VariantPath::default(),
4505+
as_type: Some(Arc::new(Field::new("result", struct_type, true))),
4506+
cast_options: CastOptions::default(),
4507+
};
4508+
4509+
let variant_array_ref = ArrayRef::from(variant_array);
4510+
// This should not panic
4511+
let result = variant_get(&variant_array_ref, options).unwrap();
4512+
4513+
let struct_result = result.as_any().downcast_ref::<StructArray>().unwrap();
4514+
assert_eq!(struct_result.len(), 2);
4515+
assert_eq!(struct_result.num_columns(), 2);
4516+
4517+
// The missing variant field should be all nulls
4518+
let missing_col = struct_result.column(1);
4519+
assert_eq!(missing_col.null_count(), missing_col.len());
4520+
4521+
// The missing variant field should preserve VariantType extension metadata
4522+
let missing_schema_field = struct_result
4523+
.fields()
4524+
.iter()
4525+
.find(|f| f.name() == "missing_field")
4526+
.unwrap();
4527+
assert!(
4528+
missing_schema_field
4529+
.try_extension_type::<VariantType>()
4530+
.is_ok(),
4531+
"missing variant field should preserve VariantType extension metadata"
4532+
);
4533+
4534+
// The missing variant field should be a valid VariantArray
4535+
let missing_variant = VariantArray::try_new(missing_col);
4536+
assert!(
4537+
missing_variant.is_ok(),
4538+
"missing variant field should be a valid VariantArray"
4539+
);
4540+
}
43264541
}

0 commit comments

Comments
 (0)