Skip to content

Commit 72b4576

Browse files
AdamGSscovich
andauthored
[Variant] Support Binary/LargeBinary children (#9610)
# Which issue does this PR close? - Closes #8387 # Rationale for this change Improves spec compliance and improves performance, allowing more zero-copy copies between parquet and/or other implementation. # What changes are included in this PR? The main change is replacing `VariantArray`'s children with opaque `ArrayRef`, and handling validation accordingly. # Are these changes tested? All existing tests are still in place. # Are there any user-facing changes? Yes, some functions on `VariantArray` now take or return `ArrayRef`, `from_parts`, `value_field`, `metadata_field`. --------- Signed-off-by: Adam Gutglick <adam@spiraldb.com> Co-authored-by: Ryan Johnson <scovich@users.noreply.github.com>
1 parent b946165 commit 72b4576

File tree

6 files changed

+331
-233
lines changed

6 files changed

+331
-233
lines changed

parquet-variant-compute/src/shred_variant.rs

Lines changed: 62 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ pub fn shred_variant(array: &VariantArray, as_type: &DataType) -> Result<Variant
9696
let (value, typed_value, nulls) = builder.finish()?;
9797
Ok(VariantArray::from_parts(
9898
array.metadata_field().clone(),
99-
Some(value),
99+
Some(Arc::new(value)),
100100
Some(typed_value),
101101
nulls,
102102
))
@@ -443,8 +443,11 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> {
443443
let mut builder = StructArrayBuilder::new();
444444
for (field_name, typed_value_builder) in self.typed_value_builders {
445445
let (value, typed_value, nulls) = typed_value_builder.finish()?;
446-
let array =
447-
ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls);
446+
let array = ShreddedVariantFieldArray::from_parts(
447+
Some(Arc::new(value)),
448+
Some(typed_value),
449+
nulls,
450+
);
448451
builder = builder.with_field(field_name, ArrayRef::from(array), false);
449452
}
450453
if let Some(nulls) = self.typed_value_nulls.finish() {
@@ -689,6 +692,7 @@ impl VariantSchemaNode {
689692
mod tests {
690693
use super::*;
691694
use crate::VariantArrayBuilder;
695+
use crate::variant_array::{binary_array_value, variant_from_arrays_at};
692696
use arrow::array::{
693697
Array, BinaryViewArray, FixedSizeBinaryArray, Float64Array, GenericListArray,
694698
GenericListViewArray, Int64Array, LargeBinaryArray, LargeStringArray, ListArray,
@@ -867,7 +871,8 @@ mod tests {
867871
) {
868872
assert_eq!(array.len(), expected_len);
869873

870-
let fallbacks = (array.value_field().unwrap(), Some(array.metadata_field()));
874+
let fallback_value = array.value_field().unwrap();
875+
let fallback_metadata = array.metadata_field();
871876
let array = downcast_list_like_array::<O>(array);
872877

873878
assert_eq!(
@@ -887,7 +892,7 @@ mod tests {
887892
);
888893
assert_eq!(
889894
array.len(),
890-
fallbacks.0.len(),
895+
fallback_value.len(),
891896
"fallbacks value field should match array length"
892897
);
893898

@@ -902,28 +907,33 @@ mod tests {
902907
// Successfully shredded: typed list value present, no fallback value
903908
assert!(array.is_valid(idx));
904909
assert_eq!(array.value_size(idx), *len);
905-
assert!(fallbacks.0.is_null(idx));
910+
assert!(fallback_value.is_null(idx));
906911
}
907912
None => {
908913
// Unable to shred: typed list value absent, fallback should carry the variant
909914
assert!(array.is_null(idx));
910915
assert_eq!(array.value_size(idx), O::zero());
911916
match expected_fallback {
912917
Some(expected_variant) => {
913-
assert!(fallbacks.0.is_valid(idx));
914-
let metadata_bytes = fallbacks
915-
.1
916-
.filter(|m| m.is_valid(idx))
917-
.map(|m| m.value(idx))
918-
.filter(|bytes| !bytes.is_empty())
919-
.unwrap_or(EMPTY_VARIANT_METADATA_BYTES);
918+
assert!(fallback_value.is_valid(idx));
919+
let metadata_bytes =
920+
binary_array_value(fallback_metadata.as_ref(), idx).unwrap();
921+
let metadata_bytes =
922+
if fallback_metadata.is_valid(idx) && !metadata_bytes.is_empty() {
923+
metadata_bytes
924+
} else {
925+
EMPTY_VARIANT_METADATA_BYTES
926+
};
920927
assert_eq!(
921-
Variant::new(metadata_bytes, fallbacks.0.value(idx)),
928+
Variant::new(
929+
metadata_bytes,
930+
binary_array_value(fallback_value.as_ref(), idx).unwrap()
931+
),
922932
expected_variant.clone()
923933
);
924934
}
925935
None => {
926-
assert!(fallbacks.0.is_null(idx));
936+
assert!(fallback_value.is_null(idx));
927937
}
928938
}
929939
}
@@ -983,7 +993,10 @@ mod tests {
983993
Some(expected_variant) => {
984994
assert!(element_fallbacks.is_valid(idx));
985995
assert_eq!(
986-
Variant::new(EMPTY_VARIANT_METADATA_BYTES, element_fallbacks.value(idx)),
996+
Variant::new(
997+
EMPTY_VARIANT_METADATA_BYTES,
998+
binary_array_value(element_fallbacks.as_ref(), idx).unwrap()
999+
),
9871000
expected_variant.clone()
9881001
);
9891002
}
@@ -1129,7 +1142,7 @@ mod tests {
11291142
#[test]
11301143
fn test_all_null_input() {
11311144
// Create VariantArray with no value field (all null case)
1132-
let metadata = BinaryViewArray::from_iter_values([&[1u8, 0u8]]); // minimal valid metadata
1145+
let metadata = Arc::new(BinaryViewArray::from_iter_values([&[1u8, 0u8]])); // minimal valid metadata
11331146
let all_null_array = VariantArray::from_parts(metadata, None, None, None);
11341147
let result = shred_variant(&all_null_array, &DataType::Int64).unwrap();
11351148

@@ -1243,7 +1256,7 @@ mod tests {
12431256
assert!(!value_field.is_null(1)); // value should contain original
12441257
assert!(typed_value_field.is_null(1)); // typed_value should be null
12451258
assert_eq!(
1246-
Variant::new(metadata_field.value(1), value_field.value(1)),
1259+
variant_from_arrays_at(metadata_field, value_field, 1).unwrap(),
12471260
Variant::from("hello")
12481261
);
12491262

@@ -1259,7 +1272,7 @@ mod tests {
12591272
assert!(!result.is_null(4));
12601273
assert!(!value_field.is_null(4)); // should contain Variant::Null
12611274
assert_eq!(
1262-
Variant::new(metadata_field.value(4), value_field.value(4)),
1275+
variant_from_arrays_at(metadata_field, value_field, 4).unwrap(),
12631276
Variant::Null
12641277
);
12651278
assert!(typed_value_field.is_null(4));
@@ -1336,7 +1349,7 @@ mod tests {
13361349
assert!(value.is_valid(1));
13371350
assert!(typed_value.is_null(1));
13381351
assert_eq!(
1339-
Variant::new(metadata.value(1), value.value(1)),
1352+
variant_from_arrays_at(metadata, value, 1).unwrap(),
13401353
Variant::from(42i64)
13411354
);
13421355

@@ -1350,7 +1363,7 @@ mod tests {
13501363
assert!(value.is_valid(3));
13511364
assert!(typed_value.is_null(3));
13521365
assert_eq!(
1353-
Variant::new(metadata.value(3), value.value(3)),
1366+
variant_from_arrays_at(metadata, value, 3).unwrap(),
13541367
Variant::Null
13551368
);
13561369

@@ -1392,7 +1405,7 @@ mod tests {
13921405
assert!(value.is_valid(1));
13931406
assert!(typed_value.is_null(1));
13941407
assert_eq!(
1395-
Variant::new(metadata.value(1), value.value(1)),
1408+
variant_from_arrays_at(metadata, value, 1).unwrap(),
13961409
Variant::from("not_binary")
13971410
);
13981411

@@ -1406,7 +1419,7 @@ mod tests {
14061419
assert!(value.is_valid(3));
14071420
assert!(typed_value.is_null(3));
14081421
assert_eq!(
1409-
Variant::new(metadata.value(3), value.value(3)),
1422+
variant_from_arrays_at(metadata, value, 3).unwrap(),
14101423
Variant::Null
14111424
);
14121425

@@ -1682,10 +1695,10 @@ mod tests {
16821695
.unwrap();
16831696
let outer_fallbacks = outer_elements.value_field().unwrap();
16841697

1685-
let outer_metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1698+
let outer_metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
16861699
EMPTY_VARIANT_METADATA_BYTES,
16871700
outer_elements.len(),
1688-
));
1701+
)));
16891702
let outer_variant = VariantArray::from_parts(
16901703
outer_metadata,
16911704
Some(outer_fallbacks.clone()),
@@ -1792,7 +1805,10 @@ mod tests {
17921805
// null is stored as Variant::Null in values
17931806
assert!(id_values.is_valid(1));
17941807
assert_eq!(
1795-
Variant::new(EMPTY_VARIANT_METADATA_BYTES, id_values.value(1)),
1808+
Variant::new(
1809+
EMPTY_VARIANT_METADATA_BYTES,
1810+
binary_array_value(id_values.as_ref(), 1).unwrap()
1811+
),
17961812
Variant::Null
17971813
);
17981814
assert!(id_typed_values.is_null(1));
@@ -1866,7 +1882,6 @@ mod tests {
18661882
assert_eq!(result.len(), 9);
18671883

18681884
let metadata = result.metadata_field();
1869-
18701885
let value = result.value_field().unwrap();
18711886
let typed_value = result
18721887
.typed_value_field()
@@ -1882,24 +1897,14 @@ mod tests {
18821897
let age_field =
18831898
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap();
18841899

1885-
let score_value = score_field
1886-
.value_field()
1887-
.unwrap()
1888-
.as_any()
1889-
.downcast_ref::<BinaryViewArray>()
1890-
.unwrap();
1900+
let score_value = score_field.value_field().unwrap();
18911901
let score_typed_value = score_field
18921902
.typed_value_field()
18931903
.unwrap()
18941904
.as_any()
18951905
.downcast_ref::<Float64Array>()
18961906
.unwrap();
1897-
let age_value = age_field
1898-
.value_field()
1899-
.unwrap()
1900-
.as_any()
1901-
.downcast_ref::<BinaryViewArray>()
1902-
.unwrap();
1907+
let age_value = age_field.value_field().unwrap();
19031908
let age_typed_value = age_field
19041909
.typed_value_field()
19051910
.unwrap()
@@ -1918,10 +1923,10 @@ mod tests {
19181923
}
19191924
fn get_value<'m, 'v>(
19201925
i: usize,
1921-
metadata: &'m BinaryViewArray,
1922-
value: &'v BinaryViewArray,
1926+
metadata: &'m dyn Array,
1927+
value: &'v dyn Array,
19231928
) -> Variant<'m, 'v> {
1924-
Variant::new(metadata.value(i), value.value(i))
1929+
variant_from_arrays_at(metadata, value, i).unwrap()
19251930
}
19261931
let expect = |i, expected_result: Option<ShreddedValue<ShreddedStruct>>| {
19271932
match expected_result {
@@ -1933,7 +1938,10 @@ mod tests {
19331938
match expected_value {
19341939
Some(expected_value) => {
19351940
assert!(value.is_valid(i));
1936-
assert_eq!(expected_value, get_value(i, metadata, value));
1941+
assert_eq!(
1942+
expected_value,
1943+
get_value(i, metadata.as_ref(), value.as_ref())
1944+
);
19371945
}
19381946
None => {
19391947
assert!(value.is_null(i));
@@ -1952,7 +1960,7 @@ mod tests {
19521960
assert!(score_value.is_valid(i));
19531961
assert_eq!(
19541962
expected_score_value,
1955-
get_value(i, metadata, score_value)
1963+
get_value(i, metadata.as_ref(), score_value.as_ref())
19561964
);
19571965
}
19581966
None => {
@@ -1973,7 +1981,7 @@ mod tests {
19731981
assert!(age_value.is_valid(i));
19741982
assert_eq!(
19751983
expected_age_value,
1976-
get_value(i, metadata, age_value)
1984+
get_value(i, metadata.as_ref(), age_value.as_ref())
19771985
);
19781986
}
19791987
None => {
@@ -2114,7 +2122,7 @@ mod tests {
21142122
// Helper to correctly create a variant object using a row's existing metadata
21152123
let object_with_foo_field = |i| {
21162124
use parquet_variant::{ParentState, ValueBuilder, VariantMetadata};
2117-
let metadata = VariantMetadata::new(metadata.value(i));
2125+
let metadata = VariantMetadata::new(binary_array_value(metadata.as_ref(), i).unwrap());
21182126
let mut metadata_builder = ReadOnlyMetadataBuilder::new(&metadata);
21192127
let mut value_builder = ValueBuilder::new();
21202128
let state = ParentState::variant(&mut value_builder, &mut metadata_builder);
@@ -2213,7 +2221,7 @@ mod tests {
22132221
assert!(value_field.is_null(2));
22142222
assert!(value_field.is_valid(3));
22152223
assert_eq!(
2216-
Variant::new(result.metadata_field().value(3), value_field.value(3)),
2224+
variant_from_arrays_at(result.metadata_field(), value_field, 3).unwrap(),
22172225
Variant::from("not an object")
22182226
);
22192227
assert!(value_field.is_null(4));
@@ -2231,10 +2239,10 @@ mod tests {
22312239
.unwrap();
22322240
assert_list_structure_and_elements::<Int64Type, i32>(
22332241
&VariantArray::from_parts(
2234-
BinaryViewArray::from_iter_values(std::iter::repeat_n(
2242+
Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
22352243
EMPTY_VARIANT_METADATA_BYTES,
22362244
scores_field.len(),
2237-
)),
2245+
))),
22382246
Some(scores_field.value_field().unwrap().clone()),
22392247
Some(scores_field.typed_value_field().unwrap().clone()),
22402248
None,
@@ -2350,24 +2358,14 @@ mod tests {
23502358
ShreddedVariantFieldArray::try_new(typed_value.column_by_name("session_id").unwrap())
23512359
.unwrap();
23522360

2353-
let id_value = id_field
2354-
.value_field()
2355-
.unwrap()
2356-
.as_any()
2357-
.downcast_ref::<BinaryViewArray>()
2358-
.unwrap();
2361+
let id_value = id_field.value_field().unwrap();
23592362
let id_typed_value = id_field
23602363
.typed_value_field()
23612364
.unwrap()
23622365
.as_any()
23632366
.downcast_ref::<FixedSizeBinaryArray>()
23642367
.unwrap();
2365-
let session_id_value = session_id_field
2366-
.value_field()
2367-
.unwrap()
2368-
.as_any()
2369-
.downcast_ref::<BinaryViewArray>()
2370-
.unwrap();
2368+
let session_id_value = session_id_field.value_field().unwrap();
23712369
let session_id_typed_value = session_id_field
23722370
.typed_value_field()
23732371
.unwrap()
@@ -2404,7 +2402,7 @@ mod tests {
24042402
assert_eq!(session_id_typed_value.value(1), mock_uuid_3.as_bytes());
24052403

24062404
// Verify the value field contains the name field
2407-
let row_1_variant = Variant::new(metadata.value(1), value.value(1));
2405+
let row_1_variant = variant_from_arrays_at(metadata, value, 1).unwrap();
24082406
let Variant::Object(obj) = row_1_variant else {
24092407
panic!("Expected object");
24102408
};
@@ -2436,7 +2434,7 @@ mod tests {
24362434

24372435
assert!(session_id_value.is_valid(3)); // type mismatch, stored in value
24382436
assert!(session_id_typed_value.is_null(3));
2439-
let session_id_variant = Variant::new(metadata.value(3), session_id_value.value(3));
2437+
let session_id_variant = variant_from_arrays_at(metadata, session_id_value, 3).unwrap();
24402438
assert_eq!(session_id_variant, Variant::from("not-a-uuid"));
24412439

24422440
// Row 4: Type mismatch - id is int64, not UUID
@@ -2447,7 +2445,7 @@ mod tests {
24472445

24482446
assert!(id_value.is_valid(4)); // type mismatch, stored in value
24492447
assert!(id_typed_value.is_null(4));
2450-
let id_variant = Variant::new(metadata.value(4), id_value.value(4));
2448+
let id_variant = variant_from_arrays_at(metadata, id_value, 4).unwrap();
24512449
assert_eq!(id_variant, Variant::from(12345i64));
24522450

24532451
assert!(session_id_value.is_null(4));

0 commit comments

Comments
 (0)