Skip to content

Commit 4676c06

Browse files
authored
[Variant] Take top-level nulls into consideration when extracting perfectly shredded children (#9702)
# Which issue does this PR close? - Closes #9701 # Rationale for this change Fixes a correctness issue, where top-level nullability will be dropped in these cases. Its important to note that due to the current canonicalization behavior, some types (like `Binary`) actually do behave correctly, this will be fully addressed in #9610 where we can support more underlying types, which simplifies it significantly. # What changes are included in this PR? Union the nullability buffers of perfectly shredded variant children with the array's top-level nullability. # Are these changes tested? In addition to existing tests, add tests that verify that the nulls are applied, both when the child is has no-nulls and when it does. # Are there any user-facing changes? Fixes incorrect behavior
1 parent 370d426 commit 4676c06

File tree

1 file changed

+151
-4
lines changed

1 file changed

+151
-4
lines changed

parquet-variant-compute/src/variant_get.rs

Lines changed: 151 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717
use arrow::{
18-
array::{self, Array, ArrayRef, StructArray},
18+
array::{self, Array, ArrayRef, StructArray, make_array},
1919
buffer::NullBuffer,
2020
compute::CastOptions,
2121
datatypes::Field,
@@ -261,8 +261,24 @@ fn try_perfect_shredding(variant_array: &VariantArray, as_field: &Field) -> Opti
261261
// 2. If every row in the `value` column is null
262262

263263
// This is a perfect shredding, where the value is entirely shredded out,
264-
// so we can just return the typed value.
265-
return Some(typed_value.clone());
264+
// so we can just return the typed value after merging the accumulated nulls.
265+
let parent_nulls = variant_array.nulls();
266+
267+
// If we have no nulls OR the shredded array is `Null`, which doesn't support external nulls.
268+
let target_array = if parent_nulls.is_none() || typed_value.data_type().is_null() {
269+
typed_value.clone()
270+
} else {
271+
let merged_nulls = NullBuffer::union(parent_nulls, typed_value.nulls());
272+
let data = typed_value
273+
.to_data()
274+
.into_builder()
275+
.nulls(merged_nulls)
276+
.build()
277+
.ok()?;
278+
make_array(data)
279+
};
280+
281+
return Some(target_array);
266282
}
267283

268284
None
@@ -347,7 +363,7 @@ mod test {
347363
Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
348364
Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array,
349365
LargeBinaryArray, LargeListArray, LargeListViewArray, LargeStringArray, ListArray,
350-
ListViewArray, NullBuilder, StringArray, StringViewArray, StructArray,
366+
ListViewArray, NullArray, NullBuilder, StringArray, StringViewArray, StructArray,
351367
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
352368
};
353369
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
@@ -4331,4 +4347,135 @@ mod test {
43314347
);
43324348
}
43334349
}
4350+
4351+
macro_rules! perfectly_shredded_preserves_top_level_nulls_test {
4352+
($name:ident, $result_type:expr, $typed_value:expr, $expected_array:expr) => {
4353+
perfectly_shredded_preserves_top_level_nulls_test!(
4354+
$name,
4355+
$result_type,
4356+
$typed_value,
4357+
Some(NullBuffer::from(vec![true, false, true])),
4358+
$expected_array
4359+
);
4360+
};
4361+
($name:ident, $result_type:expr, $typed_value:expr, $parent_nulls:expr, $expected_array:expr) => {
4362+
#[test]
4363+
fn $name() {
4364+
let metadata = Arc::new(BinaryViewArray::from_iter_values(std::iter::repeat_n(
4365+
EMPTY_VARIANT_METADATA_BYTES,
4366+
3,
4367+
)));
4368+
let typed_value: ArrayRef = Arc::new($typed_value);
4369+
let variant_array: ArrayRef =
4370+
VariantArray::from_parts(metadata, None, Some(typed_value), $parent_nulls)
4371+
.into();
4372+
4373+
let result = variant_get(
4374+
&variant_array,
4375+
GetOptions::new().with_as_type(Some(FieldRef::from(Field::new(
4376+
"result",
4377+
$result_type,
4378+
true,
4379+
)))),
4380+
)
4381+
.unwrap();
4382+
4383+
let expected_array: ArrayRef = Arc::new($expected_array);
4384+
assert_eq!(&result, &expected_array);
4385+
}
4386+
};
4387+
}
4388+
4389+
perfectly_shredded_preserves_top_level_nulls_test!(
4390+
test_variant_get_perfectly_shredded_integer_preserves_top_level_nulls,
4391+
DataType::Int32,
4392+
Int32Array::from(vec![Some(0_i32), Some(1_i32), Some(2_i32)]),
4393+
Int32Array::from(vec![Some(0_i32), None, Some(2_i32)])
4394+
);
4395+
4396+
perfectly_shredded_preserves_top_level_nulls_test!(
4397+
test_variant_get_perfectly_shredded_integer_unions_child_and_top_level_nulls,
4398+
DataType::Int32,
4399+
Int32Array::from(vec![None, Some(1_i32), Some(2_i32)]),
4400+
Some(NullBuffer::from(vec![true, false, true])),
4401+
Int32Array::from(vec![None, None, Some(2_i32)])
4402+
);
4403+
4404+
perfectly_shredded_preserves_top_level_nulls_test!(
4405+
test_variant_get_perfectly_shredded_null_preserves_top_level_nulls,
4406+
DataType::Null,
4407+
NullArray::new(3),
4408+
NullArray::new(3)
4409+
);
4410+
4411+
perfectly_shredded_preserves_top_level_nulls_test!(
4412+
test_variant_get_perfectly_shredded_binary_view_preserves_top_level_nulls,
4413+
DataType::BinaryView,
4414+
BinaryViewArray::from(vec![
4415+
Some(b"Apache" as &[u8]),
4416+
Some(b"masked-null" as &[u8]),
4417+
Some(b"Parquet-variant" as &[u8]),
4418+
]),
4419+
BinaryViewArray::from(vec![
4420+
Some(b"Apache" as &[u8]),
4421+
None,
4422+
Some(b"Parquet-variant" as &[u8]),
4423+
])
4424+
);
4425+
4426+
perfectly_shredded_preserves_top_level_nulls_test!(
4427+
test_variant_get_perfectly_shredded_binary_preserves_top_level_nulls,
4428+
DataType::Binary,
4429+
BinaryArray::from(vec![
4430+
Some(b"Apache" as &[u8]),
4431+
Some(b"masked-null" as &[u8]),
4432+
Some(b"Parquet-variant" as &[u8]),
4433+
]),
4434+
BinaryArray::from(vec![
4435+
Some(b"Apache" as &[u8]),
4436+
None,
4437+
Some(b"Parquet-variant" as &[u8]),
4438+
])
4439+
);
4440+
4441+
perfectly_shredded_preserves_top_level_nulls_test!(
4442+
test_variant_get_perfectly_shredded_decimal4_preserves_top_level_nulls,
4443+
DataType::Decimal32(5, 2),
4444+
Decimal32Array::from(vec![Some(12345), Some(23400), Some(-12342)])
4445+
.with_precision_and_scale(5, 2)
4446+
.unwrap(),
4447+
Decimal32Array::from(vec![Some(12345), None, Some(-12342)])
4448+
.with_precision_and_scale(5, 2)
4449+
.unwrap()
4450+
);
4451+
4452+
perfectly_shredded_preserves_top_level_nulls_test!(
4453+
test_variant_get_perfectly_shredded_decimal8_preserves_top_level_nulls,
4454+
DataType::Decimal64(10, 1),
4455+
Decimal64Array::from(vec![Some(1234567809), Some(1456787000), Some(-1234561203)])
4456+
.with_precision_and_scale(10, 1)
4457+
.unwrap(),
4458+
Decimal64Array::from(vec![Some(1234567809), None, Some(-1234561203)])
4459+
.with_precision_and_scale(10, 1)
4460+
.unwrap()
4461+
);
4462+
4463+
perfectly_shredded_preserves_top_level_nulls_test!(
4464+
test_variant_get_perfectly_shredded_decimal16_preserves_top_level_nulls,
4465+
DataType::Decimal128(20, 3),
4466+
Decimal128Array::from(vec![
4467+
Some(i128::from_str("12345678901234567899").unwrap()),
4468+
Some(i128::from_str("23445677483748324300").unwrap()),
4469+
Some(i128::from_str("-12345678901234567899").unwrap()),
4470+
])
4471+
.with_precision_and_scale(20, 3)
4472+
.unwrap(),
4473+
Decimal128Array::from(vec![
4474+
Some(i128::from_str("12345678901234567899").unwrap()),
4475+
None,
4476+
Some(i128::from_str("-12345678901234567899").unwrap()),
4477+
])
4478+
.with_precision_and_scale(20, 3)
4479+
.unwrap()
4480+
);
43344481
}

0 commit comments

Comments
 (0)