Skip to content
4 changes: 3 additions & 1 deletion parquet-variant-compute/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder};

pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options};
pub use from_json::json_to_variant;
pub use shred_variant::{IntoShreddingField, ShreddedSchemaBuilder, shred_variant};
pub use shred_variant::{
IntoShreddingField, ShreddedSchemaBuilder, shred_variant, shred_variant_with_options,
};
pub use to_json::variant_to_json;
pub use unshred_variant::unshred_variant;
pub use variant_get::{GetOptions, variant_get};
130 changes: 116 additions & 14 deletions parquet-variant-compute/src/shred_variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ use std::sync::Arc;
/// See [`ShreddedSchemaBuilder`] for a convenient way to build the `as_type`
/// value passed to this function.
pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<VariantArray> {
    // Convenience entry point: delegate to the configurable variant using the default
    // cast options.
    let default_options = CastOptions::default();
    shred_variant_with_options(array, as_type, &default_options)
}

pub fn shred_variant_with_options(
array: &VariantArray,
as_type: &DataType,
cast_options: &CastOptions,
) -> Result<VariantArray> {
if array.typed_value_field().is_some() {
return Err(ArrowError::InvalidArgumentError(
"Input is already shredded".to_string(),
Expand All @@ -79,10 +87,9 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<Variant
return Ok(array.clone());
};

let cast_options = CastOptions::default();
let mut builder = make_variant_to_shredded_variant_arrow_row_builder(
as_type,
&cast_options,
cast_options,
array.len(),
NullValue::TopLevelVariant,
)?;
Expand Down Expand Up @@ -321,12 +328,19 @@ impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
// If the variant is not an array, typed_value must be null.
Comment thread
rishvin marked this conversation as resolved.
// If the variant is an array, value must be null.
match variant {
Variant::List(list) => {
Variant::List(ref list) => {
self.nulls.append_non_null();
self.value_builder.append_null();
self.typed_value_builder
.append_value(&Variant::List(list))?;
Ok(true)

// With the `safe` cast option set to false, appending a list of the wrong size to a
// `typed_value_builder` of type `FixedSizeList` results in an error, which is propagated.
// When `typed_value_builder` instead reports that the list was not shredded, the
// provided list is appended to the `value_builder`.
Comment on lines +334 to +336
Copy link
Copy Markdown
Contributor

@scovich scovich Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure the variant shredding spec allows for shredding as a fixed size list, if the resulting layout differs physically from a normal list?

Arrays can be shredded by using a 3-level Parquet list for typed_value.

If the value is not an array, typed_value must be null. If the value is an array, value must be null.

It looks to me like any attempt to shred as fixed-sized list must either succeed (if the size is correct) or hard-fail (because value as fallback is not allowed).

Copy link
Copy Markdown
Contributor

@sdf-jkl sdf-jkl Apr 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It differs physically on the Arrow side, but once we write it to Parquet it'd be the same as other ListLikeArrays. But this leads to further discussion on adding FixedSizeList support for VariantArray, as well as implementing other types that are currently not supported in the spec.

We're keeping value because we consider this a cast from Variant to FixedSizeList. The extra len check is there because there is no Variant::FixedSizeList enum to match to. If len is incorrect we consider the cast failed and proceed following the safe cast option as if typed_value is Null.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When casting from variant to arrow, we can do whatever we want.

But this code here is about going from binary variant to shredded variant. And the variant shredding spec directly forbids value to contain a variant array, when shredding as array.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True. I think the core issue is that Parquet currently has only one logical LIST type. If Parquet had a dedicated logical type for FixedSizeList, the spec wording could be more explicit.

Btw, there’s ongoing work on this too: apache/parquet-format#241 (recently revived).

Given the current spec text:

Arrays can be shredded by using a 3-level Parquet list for typed_value.

If the value is not an array, typed_value must be null. If the value is an array, value must be null.

I read “array” as "a value matching the specific list shape we’re shredding into". For List/LargeList/ListView it's List values, for FixedSizeList array it's a FixedSizeList value.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I understand, the variant spec neither knows nor cares about the intricacies of arrow array types (it also doesn't care about spark or SQL). If we're shredding to a 3-level parquet list, and we encounter a variant array value, the resulting value column entry must be null.

Copy link
Copy Markdown

@cashmand cashmand Apr 15, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, I worked on the shredding spec, and the intent of that line of the spec was to apply to any array, not just one that perfectly matches the shredding schema. For example, in a query with try_cast(v as array<variant>), an engine would be entitled to only fetch the typed_value column from parquet, and produce null for all of the rows where typed_value is null. This would break if value could contain arrays.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Thanks for the clarification @cashmand, @scovich!

let shredded = self.typed_value_builder.append_value(&variant)?;
if shredded {
self.value_builder.append_null();
} else {
self.value_builder.append_value(Variant::List(list.clone()));
}
Ok(shredded)
}
other => {
self.nulls.append_non_null();
Expand Down Expand Up @@ -690,9 +704,9 @@ mod tests {
use super::*;
use crate::VariantArrayBuilder;
use arrow::array::{
Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray,
GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray,
ListLikeArray, OffsetSizeTrait, PrimitiveArray, StringArray,
Array, BinaryViewArray, FixedSizeBinaryArray, FixedSizeListArray, Float64Array,
GenericListArray, GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray,
ListArray, ListLikeArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray,
};
use arrow::datatypes::{
ArrowPrimitiveType, DataType, Field, Fields, Int64Type, TimeUnit, UnionFields, UnionMode,
Expand Down Expand Up @@ -1608,17 +1622,105 @@ mod tests {

#[test]
fn test_array_shredding_as_fixed_size_list() {
    // Rows 0 and 2 are two-element integer lists; row 1 is a plain string value.
    let input = build_variant_array(vec![
        VariantRow::List(vec![VariantValue::from(1i64), VariantValue::from(2i64)]),
        VariantRow::Value(VariantValue::from("This should not be shredded")),
        VariantRow::List(vec![VariantValue::from(3i64), VariantValue::from(4i64)]),
    ]);

    // Request shredding as a fixed-size list of two nullable Int64 items.
    let item_field = Arc::new(Field::new("item", DataType::Int64, true));
    let list_schema = DataType::FixedSizeList(item_field, 2);
    let result = shred_variant(&input, &list_schema).unwrap();
    assert_eq!(result.len(), 3);

    // Every row stays valid at the top level. A shredded row has a null `value` and a valid
    // `typed_value`; a row that did not match the shredding schema keeps its raw `value`
    // and has a null `typed_value`.
    for (row, expect_shredded) in [(0usize, true), (1, false), (2, true)] {
        assert!(result.is_valid(row));
        assert_eq!(result.value_field().unwrap().is_null(row), expect_shredded);
        assert_eq!(
            result.typed_value_field().unwrap().is_valid(row),
            expect_shredded
        );
    }

    // The `typed_value` column must be a `FixedSizeListArray` of the requested width.
    let typed_value = result.typed_value_field().unwrap();
    let fixed_size_list = typed_value
        .as_any()
        .downcast_ref::<FixedSizeListArray>()
        .expect("Expected FixedSizeListArray");
    assert_eq!(fixed_size_list.len(), 3);
    assert_eq!(fixed_size_list.value_length(), 2);

    // Checks that the list entry at `row` is a struct whose `typed_value` child is an
    // `Int64Array` holding exactly `expected`.
    let assert_shredded_ints = |row: usize, expected: [i64; 2]| {
        let entry = fixed_size_list.value(row);
        let entry_struct = entry.as_any().downcast_ref::<StructArray>().unwrap();
        let entry_typed = entry_struct.column_by_name("typed_value").unwrap();
        let entry_ints = entry_typed.as_any().downcast_ref::<Int64Array>().unwrap();
        assert_eq!(entry_ints.values(), &expected);
    };

    // First entry holds the first input list.
    assert_shredded_ints(0, [1i64, 2i64]);

    // Second entry could not be shredded, so its slot in the list array is null.
    assert!(fixed_size_list.is_null(1));

    // Third entry holds the last input list.
    assert_shredded_ints(2, [3i64, 4i64]);
}

#[test]
fn test_array_shredding_as_fixed_size_list_wrong_size() {
Comment thread
rishvin marked this conversation as resolved.
let input = build_variant_array(vec![VariantRow::List(vec![
VariantValue::from(1i64),
VariantValue::from(2i64),
VariantValue::from(3i64),
])]);
let list_schema =
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2);
let err = shred_variant(&input, &list_schema).unwrap_err();
assert_eq!(
err.to_string(),
"Not yet implemented: Converting unshredded variant arrays to arrow fixed-size lists"

let result = shred_variant_with_options(
&input,
&list_schema,
&CastOptions {
safe: true,
..Default::default()
},
)
.unwrap();
assert_eq!(result.len(), 1);

// With `safe` set to true, the incorrect size should not raise an error.
assert!(result.is_valid(0));
assert!(result.value_field().unwrap().is_valid(0));
assert!(result.typed_value_field().unwrap().is_null(0));

// With `safe` set to false, the incorrect size should raise error.
let err = shred_variant_with_options(
&input,
&list_schema,
&CastOptions {
safe: false,
..Default::default()
},
)
.unwrap_err();
assert!(
err.to_string()
.contains("Expected fixed size list of size 2, got size 3"),
"got: {err}",
);
}

Expand Down
84 changes: 60 additions & 24 deletions parquet-variant-compute/src/variant_get.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,9 +344,9 @@ mod test {
use arrow::array::{
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array,
LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray,
ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray,
FixedSizeListArray, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray,
ListArray, ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray,
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
Expand Down Expand Up @@ -4112,13 +4112,29 @@ mod test {
(
DataType::LargeListView(field.clone()),
Arc::new(LargeListViewArray::new(
field,
field.clone(),
ScalarBuffer::from(vec![0, 3]),
ScalarBuffer::from(vec![3, 0]),
element_array,
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
(
DataType::FixedSizeList(field.clone(), 3),
Arc::new(FixedSizeListArray::new(
field,
3,
Arc::new(Int64Array::from(vec![
Some(1),
None,
Some(3),
None,
None,
None,
])),
Some(NullBuffer::from(vec![true, false])),
)) as ArrayRef,
),
];

for (request_type, expected) in expectations {
Expand Down Expand Up @@ -4281,7 +4297,8 @@ mod test {
DataType::List(item_field.clone()),
DataType::LargeList(item_field.clone()),
DataType::ListView(item_field.clone()),
DataType::LargeListView(item_field),
DataType::LargeListView(item_field.clone()),
DataType::FixedSizeList(item_field, 2),
];

for data_type in data_types {
Expand All @@ -4298,27 +4315,46 @@ mod test {
}

#[test]
fn test_variant_get_fixed_size_list_not_implemented() {
let string_array: ArrayRef = Arc::new(StringArray::from(vec!["[1, 2]", "\"not a list\""]));
fn test_variant_get_fixed_size_list_wrong_size() {
let string_array: ArrayRef = Arc::new(StringArray::from(vec!["[1, 2, 3]"]));
let variant_array = ArrayRef::from(json_to_variant(&string_array).unwrap());
let item_field = Arc::new(Field::new("item", Int64, true));
for safe in [true, false] {
let options = GetOptions::new()
.with_as_type(Some(FieldRef::from(Field::new(
"result",
DataType::FixedSizeList(item_field.clone(), 2),
true,
))))
.with_cast_options(CastOptions {
safe,
..Default::default()
});

let err = variant_get(&variant_array, options).unwrap_err();
assert!(
err.to_string()
.contains("Converting unshredded variant arrays to arrow fixed-size lists")
);
}
// With `safe` set to true, size mismatch should return Null.
let options = GetOptions::new()
.with_as_type(Some(FieldRef::from(Field::new(
"result",
DataType::FixedSizeList(item_field.clone(), 2),
true,
))))
.with_cast_options(CastOptions {
safe: true,
..Default::default()
});
let result = variant_get(&variant_array, options).unwrap();
let fixed_size_list = result
.as_any()
.downcast_ref::<FixedSizeListArray>()
.expect("Expected FixedSizeListArray");
assert_eq!(fixed_size_list.len(), 1);
assert!(fixed_size_list.is_null(0));

// With `safe` set to false, error should be raised on wrong sized fixed list.
let options = GetOptions::new()
.with_as_type(Some(FieldRef::from(Field::new(
"result",
DataType::FixedSizeList(item_field.clone(), 2),
true,
))))
.with_cast_options(CastOptions {
safe: false,
..Default::default()
});
let err = variant_get(&variant_array, options).unwrap_err();
assert!(
err.to_string()
.contains("Expected fixed size list of size 2, got size 3"),
"got: {err}",
);
}
}
Loading