Skip to content

Commit 808b1b5

Browse files
committed
Add support for FixedSizeList to variant_to_arrow
1 parent ec771cc commit 808b1b5

File tree

3 files changed

+236
-22
lines changed

3 files changed

+236
-22
lines changed

parquet-variant-compute/src/shred_variant.rs

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -689,9 +689,9 @@ mod tests {
689689
use super::*;
690690
use crate::VariantArrayBuilder;
691691
use arrow::array::{
692-
Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray,
693-
GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray,
694-
ListLikeArray, OffsetSizeTrait, PrimitiveArray, StringArray,
692+
Array, BinaryViewArray, FixedSizeBinaryArray, FixedSizeListArray, Float64Array,
693+
GenericListArray, GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray,
694+
ListArray, ListLikeArray, OffsetSizeTrait, PrimitiveArray, StringArray, StructArray,
695695
};
696696
use arrow::datatypes::{
697697
ArrowPrimitiveType, DataType, Field, Fields, Int64Type, TimeUnit, UnionFields, UnionMode,
@@ -1607,6 +1607,67 @@ mod tests {
16071607

16081608
#[test]
fn test_array_shredding_as_fixed_size_list() {
    // Rows 0 and 2 are two-element integer lists; row 1 is a plain string.
    let input = build_variant_array(vec![
        VariantRow::List(vec![VariantValue::from(1i64), VariantValue::from(2i64)]),
        VariantRow::Value(VariantValue::from("This should not be shredded")),
        VariantRow::List(vec![VariantValue::from(3i64), VariantValue::from(4i64)]),
    ]);

    let item_field = Arc::new(Field::new("item", DataType::Int64, true));
    let list_schema = DataType::FixedSizeList(item_field, 2);
    let result = shred_variant(&input, &list_schema).unwrap();
    assert_eq!(result.len(), 3);

    // Every row is valid at the top level. A shredded row has a null `value` and a valid
    // `typed_value`; a row that did not match the shredding schema keeps its raw `value`
    // and has a null `typed_value`.
    for (row, shredded) in [(0, true), (1, false), (2, true)] {
        assert!(result.is_valid(row));
        assert_eq!(result.value_field().unwrap().is_null(row), shredded);
        assert_eq!(result.typed_value_field().unwrap().is_valid(row), shredded);
    }

    // The typed value column must be a `FixedSizeList` of width 2 covering all three rows.
    let fixed_size_list = result
        .typed_value_field()
        .unwrap()
        .as_any()
        .downcast_ref::<FixedSizeListArray>()
        .expect("Expected FixedSizeListArray");
    assert_eq!(fixed_size_list.len(), 3);
    assert_eq!(fixed_size_list.value_length(), 2);

    // Extracts the `typed_value` integers stored in the list entry at `row`.
    let typed_ints = |row: usize| -> Vec<i64> {
        let entry = fixed_size_list.value(row);
        let entry_struct = entry.as_any().downcast_ref::<StructArray>().unwrap();
        let typed = entry_struct.column_by_name("typed_value").unwrap();
        let ints = typed.as_any().downcast_ref::<Int64Array>().unwrap();
        ints.values().to_vec()
    };

    // First entry holds the first list.
    assert_eq!(typed_ints(0), [1i64, 2i64]);

    // Second entry could not be shredded, hence it is null in the `FixedSizeList`.
    assert!(fixed_size_list.is_null(1));

    // Third entry holds the second list.
    assert_eq!(typed_ints(2), [3i64, 4i64]);
}
1668+
1669+
#[test]
1670+
fn test_array_shredding_as_fixed_size_list_wrong_size() {
16101671
let input = build_variant_array(vec![VariantRow::List(vec![
16111672
VariantValue::from(1i64),
16121673
VariantValue::from(2i64),
@@ -1615,9 +1676,12 @@ mod tests {
16151676
let list_schema =
16161677
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Int64, true)), 2);
16171678
let err = shred_variant(&input, &list_schema).unwrap_err();
1618-
assert_eq!(
1619-
err.to_string(),
1620-
"Not yet implemented: Converting unshredded variant arrays to arrow fixed-size lists"
1679+
println!("{}", err);
1680+
assert!(
1681+
err.to_string()
1682+
.contains("Expected fixed size list of size 2, got size 3"),
1683+
"got: {}",
1684+
err
16211685
);
16221686
}
16231687

parquet-variant-compute/src/variant_get.rs

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -351,9 +351,9 @@ mod test {
351351
use arrow::array::{
352352
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
353353
Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
354-
Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array,
355-
LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray,
356-
ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray,
354+
FixedSizeListArray, Float32Array, Float64Array, Int8Array, Int16Array, Int32Array,
355+
Int64Array, LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray,
356+
ListArray, ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray,
357357
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
358358
};
359359
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
@@ -4336,7 +4336,8 @@ mod test {
43364336
DataType::List(item_field.clone()),
43374337
DataType::LargeList(item_field.clone()),
43384338
DataType::ListView(item_field.clone()),
4339-
DataType::LargeListView(item_field),
4339+
DataType::LargeListView(item_field.clone()),
4340+
DataType::FixedSizeList(item_field, 2),
43404341
];
43414342

43424343
for data_type in data_types {
@@ -4353,10 +4354,65 @@ mod test {
43534354
}
43544355

43554356
#[test]
fn test_variant_get_fixed_size_list_with_safe_option() {
    // Two JSON arrays of the requested width followed by a value that is not a list.
    let json_rows = vec!["[1, 2]", "[3, 4]", "\"not a list\""];
    let string_array: ArrayRef = Arc::new(StringArray::from(json_rows));
    let variant_array = ArrayRef::from(json_to_variant(&string_array).unwrap());

    // Ask for a `FixedSizeList` of width 2 with `safe` enabled, so that `variant_get` does
    // not raise an error on the type mismatch in the last row.
    let item_field = Arc::new(Field::new("item", Int64, true));
    let target_field = Field::new("result", DataType::FixedSizeList(item_field, 2), true);
    let options = GetOptions::new()
        .with_as_type(Some(FieldRef::from(target_field)))
        .with_cast_options(CastOptions {
            safe: true,
            ..Default::default()
        });

    // The result must be a `FixedSizeList` of width 2 with one entry per input row.
    let result = variant_get(&variant_array, options).unwrap();
    let fixed_size_list = result
        .as_any()
        .downcast_ref::<FixedSizeListArray>()
        .expect("Expected FixedSizeListArray");
    assert_eq!(fixed_size_list.len(), 3);
    assert_eq!(fixed_size_list.value_length(), 2);

    // Extracts the `typed_value` integers stored in the list entry at `row`.
    let typed_ints = |row: usize| -> Vec<i64> {
        let entry = fixed_size_list.value(row);
        let entry_struct = entry.as_any().downcast_ref::<StructArray>().unwrap();
        let typed = entry_struct.column_by_name("typed_value").unwrap();
        let ints = typed.as_any().downcast_ref::<Int64Array>().unwrap();
        ints.values().to_vec()
    };

    // First entry is valid and holds the first array.
    assert!(fixed_size_list.is_valid(0));
    assert_eq!(typed_ints(0), [1i64, 2i64]);

    // Second entry is valid and holds the second array.
    assert!(fixed_size_list.is_valid(1));
    assert_eq!(typed_ints(1), [3i64, 4i64]);

    // Third entry is null due to the type mismatch.
    assert!(fixed_size_list.is_null(2));
}
4407+
4408+
#[test]
4409+
fn test_variant_get_fixed_size_list_wrong_size() {
4410+
let string_array: ArrayRef = Arc::new(StringArray::from(vec!["[1, 2, 3]"]));
4411+
let variant_array = ArrayRef::from(json_to_variant(&string_array).unwrap());
4412+
let item_field = Arc::new(Field::new("item", Int64, true));
4413+
4414+
// Set the safe flag to both true and false and verify that size mismatch raises an error
4415+
// for `FixedSizeList`, regardless.
43604416
for safe in [true, false] {
43614417
let options = GetOptions::new()
43624418
.with_as_type(Some(FieldRef::from(Field::new(
@@ -4368,11 +4424,11 @@ mod test {
43684424
safe,
43694425
..Default::default()
43704426
});
4371-
43724427
let err = variant_get(&variant_array, options).unwrap_err();
43734428
assert!(
43744429
err.to_string()
4375-
.contains("Converting unshredded variant arrays to arrow fixed-size lists")
4430+
.contains("Expected fixed size list of size 2, got size 3"),
4431+
"safe={safe}, got: {err}",
43764432
);
43774433
}
43784434
}

parquet-variant-compute/src/variant_to_arrow.rs

Lines changed: 102 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,10 @@ use crate::variant_array::ShreddedVariantFieldArray;
2727
use crate::{VariantArray, VariantValueArrayBuilder};
2828
use arrow::array::{
2929
ArrayRef, ArrowNativeTypeOp, BinaryBuilder, BinaryLikeArrayBuilder, BinaryViewArray,
30-
BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, GenericListArray,
31-
GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray, NullBufferBuilder,
32-
OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, StringViewBuilder,
33-
StructArray,
30+
BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, FixedSizeListArray,
31+
GenericListArray, GenericListViewArray, LargeBinaryBuilder, LargeStringBuilder, NullArray,
32+
NullBufferBuilder, OffsetSizeTrait, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder,
33+
StringViewBuilder, StructArray,
3434
};
3535
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
3636
use arrow::compute::{CastOptions, DecimalCast};
@@ -507,6 +507,7 @@ pub(crate) enum ArrayVariantToArrowRowBuilder<'a> {
507507
LargeList(VariantToListArrowRowBuilder<'a, i64, false>),
508508
ListView(VariantToListArrowRowBuilder<'a, i32, true>),
509509
LargeListView(VariantToListArrowRowBuilder<'a, i64, true>),
510+
FixedSizeList(VariantToFixedSizeListArrowRowBuilder<'a>),
510511
}
511512

512513
pub(crate) struct StructVariantToArrowRowBuilder<'a> {
@@ -611,10 +612,14 @@ impl<'a> ArrayVariantToArrowRowBuilder<'a> {
611612
DataType::LargeList(field) => make_list_builder!(LargeList, i64, false, field),
612613
DataType::ListView(field) => make_list_builder!(ListView, i32, true, field),
613614
DataType::LargeListView(field) => make_list_builder!(LargeListView, i64, true, field),
614-
DataType::FixedSizeList(..) => {
615-
return Err(ArrowError::NotYetImplemented(
616-
"Converting unshredded variant arrays to arrow fixed-size lists".to_string(),
617-
));
615+
DataType::FixedSizeList(field, size) => {
616+
FixedSizeList(VariantToFixedSizeListArrowRowBuilder::try_new(
617+
field.clone(),
618+
field.data_type(),
619+
*size,
620+
cast_options,
621+
capacity,
622+
)?)
618623
}
619624
other => {
620625
return Err(ArrowError::InvalidArgumentError(format!(
@@ -631,6 +636,7 @@ impl<'a> ArrayVariantToArrowRowBuilder<'a> {
631636
Self::LargeList(builder) => builder.append_null(),
632637
Self::ListView(builder) => builder.append_null(),
633638
Self::LargeListView(builder) => builder.append_null(),
639+
Self::FixedSizeList(builder) => builder.append_null(),
634640
}
635641
}
636642

@@ -640,6 +646,7 @@ impl<'a> ArrayVariantToArrowRowBuilder<'a> {
640646
Self::LargeList(builder) => builder.append_value(value),
641647
Self::ListView(builder) => builder.append_value(value),
642648
Self::LargeListView(builder) => builder.append_value(value),
649+
Self::FixedSizeList(builder) => builder.append_value(value),
643650
}
644651
}
645652

@@ -649,6 +656,7 @@ impl<'a> ArrayVariantToArrowRowBuilder<'a> {
649656
Self::LargeList(builder) => builder.finish(),
650657
Self::ListView(builder) => builder.finish(),
651658
Self::LargeListView(builder) => builder.finish(),
659+
Self::FixedSizeList(builder) => builder.finish(),
652660
}
653661
}
654662
}
@@ -1003,6 +1011,92 @@ where
10031011
}
10041012
}
10051013

1014+
pub(crate) struct VariantToFixedSizeListArrowRowBuilder<'a> {
1015+
field: FieldRef,
1016+
list_size: i32,
1017+
element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
1018+
nulls: NullBufferBuilder,
1019+
cast_options: &'a CastOptions<'a>,
1020+
}
1021+
1022+
impl<'a> VariantToFixedSizeListArrowRowBuilder<'a> {
1023+
fn try_new(
1024+
field: FieldRef,
1025+
element_data_type: &'a DataType,
1026+
list_size: i32,
1027+
cast_options: &'a CastOptions,
1028+
capacity: usize,
1029+
) -> Result<Self> {
1030+
let element_builder = make_variant_to_shredded_variant_arrow_row_builder(
1031+
element_data_type,
1032+
cast_options,
1033+
capacity,
1034+
NullValue::ArrayElement,
1035+
)?;
1036+
Ok(Self {
1037+
field,
1038+
list_size,
1039+
element_builder: Box::new(element_builder),
1040+
nulls: NullBufferBuilder::new(capacity),
1041+
cast_options,
1042+
})
1043+
}
1044+
1045+
fn append_null(&mut self) -> Result<()> {
1046+
for _ in 0..self.list_size {
1047+
self.element_builder.append_null()?;
1048+
}
1049+
self.nulls.append_null();
1050+
Ok(())
1051+
}
1052+
1053+
fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
1054+
match value {
1055+
Variant::List(list) => {
1056+
let len = list.len();
1057+
if len != self.list_size as usize {
1058+
return Err(ArrowError::CastError(format!(
1059+
"Expected fixed size list of size {}, got size {}",
1060+
self.list_size, len
1061+
)));
1062+
}
1063+
for element in list.iter() {
1064+
self.element_builder.append_value(element)?;
1065+
}
1066+
self.nulls.append_non_null();
1067+
Ok(true)
1068+
}
1069+
_ if self.cast_options.safe => {
1070+
self.append_null()?;
1071+
Ok(false)
1072+
}
1073+
_ => Err(ArrowError::CastError(format!(
1074+
"Failed to extract fixed size list from variant {:?}",
1075+
value
1076+
))),
1077+
}
1078+
}
1079+
1080+
fn finish(mut self) -> Result<ArrayRef> {
1081+
let (value, typed_value, nulls) = self.element_builder.finish()?;
1082+
let element_array =
1083+
ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls);
1084+
let field = Arc::new(
1085+
self.field
1086+
.as_ref()
1087+
.clone()
1088+
.with_data_type(element_array.data_type().clone()),
1089+
);
1090+
let fixed_size_list_array = FixedSizeListArray::try_new(
1091+
field,
1092+
self.list_size,
1093+
ArrayRef::from(element_array),
1094+
self.nulls.finish(),
1095+
)?;
1096+
Ok(Arc::new(fixed_size_list_array))
1097+
}
1098+
}
1099+
10061100
/// Builder for creating VariantArray output (for path extraction without type conversion)
10071101
pub(crate) struct VariantToBinaryVariantArrowRowBuilder {
10081102
metadata: BinaryViewArray,

0 commit comments

Comments
 (0)