Skip to content

Commit 1e5f581

Browse files
committed
Merge branch 'main' into issue-9497-list-json-reader-bench
2 parents 6789ab4 + d2e2cda commit 1e5f581

File tree

10 files changed

+518
-64
lines changed

10 files changed

+518
-64
lines changed

arrow-array/src/cast.rs

Lines changed: 58 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ macro_rules! repeat_pat {
7474
/// [`DataType`]: arrow_schema::DataType
7575
#[macro_export]
7676
macro_rules! downcast_integer {
77-
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
77+
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
7878
match ($($data_type),+) {
7979
$crate::repeat_pat!($crate::cast::__private::DataType::Int8, $($data_type),+) => {
8080
$m!($crate::types::Int8Type $(, $args)*)
@@ -100,7 +100,7 @@ macro_rules! downcast_integer {
100100
$crate::repeat_pat!($crate::cast::__private::DataType::UInt64, $($data_type),+) => {
101101
$m!($crate::types::UInt64Type $(, $args)*)
102102
}
103-
$($p $(if $pred)* => $fallback,)*
103+
$($p $(if $pred)? => $fallback,)*
104104
}
105105
};
106106
}
@@ -138,21 +138,24 @@ macro_rules! downcast_integer {
138138
/// [`DataType`]: arrow_schema::DataType
139139
#[macro_export]
140140
macro_rules! downcast_integer_array {
141-
($values:ident => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
142-
$crate::downcast_integer_array!($values => {$e} $($p $(if $pred)* => $fallback)*)
143-
};
144-
(($($values:ident),+) => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
145-
$crate::downcast_integer_array!($($values),+ => {$e} $($p $(if $pred)* => $fallback)*)
146-
};
147-
($($values:ident),+ => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
148-
$crate::downcast_integer_array!(($($values),+) => $e $($p $(if $pred)* => $fallback)*)
149-
};
150-
(($($values:ident),+) => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
141+
($($values:ident),+ => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
151142
$crate::downcast_integer!{
152143
$($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e),
153-
$($p $(if $pred)* => $fallback,)*
144+
$($p $(if $pred)? => $fallback,)*
154145
}
155146
};
147+
// Turn $e into a block.
148+
($values:ident => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
149+
$crate::downcast_integer_array!($values => {$e} $($p $(if $pred)? => $fallback,)*)
150+
};
151+
// Remove $values parentheses.
152+
(($($values:ident),+) => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
153+
$crate::downcast_integer_array!($($values),+ => $e $($p $(if $pred)? => $fallback,)*)
154+
};
155+
// Turn $e into a block & remove $values parentheses.
156+
(($($values:ident),+) => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
157+
$crate::downcast_integer_array!($($values),+ => {$e} $($p $(if $pred)? => $fallback,)*)
158+
};
156159
}
157160

158161
/// Given one or more expressions evaluating to an integer [`DataType`] invokes the provided macro
@@ -189,7 +192,7 @@ macro_rules! downcast_integer_array {
189192
/// [`DataType`]: arrow_schema::DataType
190193
#[macro_export]
191194
macro_rules! downcast_run_end_index {
192-
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
195+
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
193196
match ($($data_type),+) {
194197
$crate::repeat_pat!($crate::cast::__private::DataType::Int16, $($data_type),+) => {
195198
$m!($crate::types::Int16Type $(, $args)*)
@@ -200,7 +203,7 @@ macro_rules! downcast_run_end_index {
200203
$crate::repeat_pat!($crate::cast::__private::DataType::Int64, $($data_type),+) => {
201204
$m!($crate::types::Int64Type $(, $args)*)
202205
}
203-
$($p $(if $pred)* => $fallback,)*
206+
$($p $(if $pred)? => $fallback,)*
204207
}
205208
};
206209
}
@@ -234,7 +237,7 @@ macro_rules! downcast_run_end_index {
234237
/// [`DataType`]: arrow_schema::DataType
235238
#[macro_export]
236239
macro_rules! downcast_temporal {
237-
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
240+
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
238241
match ($($data_type),+) {
239242
$crate::repeat_pat!($crate::cast::__private::DataType::Time32($crate::cast::__private::TimeUnit::Second), $($data_type),+) => {
240243
$m!($crate::types::Time32SecondType $(, $args)*)
@@ -266,7 +269,7 @@ macro_rules! downcast_temporal {
266269
$crate::repeat_pat!($crate::cast::__private::DataType::Timestamp($crate::cast::__private::TimeUnit::Nanosecond, _), $($data_type),+) => {
267270
$m!($crate::types::TimestampNanosecondType $(, $args)*)
268271
}
269-
$($p $(if $pred)* => $fallback,)*
272+
$($p $(if $pred)? => $fallback,)*
270273
}
271274
};
272275
}
@@ -304,21 +307,24 @@ macro_rules! downcast_temporal {
304307
/// [`DataType`]: arrow_schema::DataType
305308
#[macro_export]
306309
macro_rules! downcast_temporal_array {
307-
($values:ident => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
308-
$crate::downcast_temporal_array!($values => {$e} $($p $(if $pred)* => $fallback)*)
309-
};
310-
(($($values:ident),+) => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
311-
$crate::downcast_temporal_array!($($values),+ => {$e} $($p $(if $pred)* => $fallback)*)
312-
};
313-
($($values:ident),+ => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
314-
$crate::downcast_temporal_array!(($($values),+) => $e $($p $(if $pred)* => $fallback)*)
315-
};
316-
(($($values:ident),+) => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
310+
($($values:ident),+ => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
317311
$crate::downcast_temporal!{
318312
$($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e),
319-
$($p $(if $pred)* => $fallback,)*
313+
$($p $(if $pred)? => $fallback,)*
320314
}
321315
};
316+
// Turn $e into a block.
317+
($values:ident => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
318+
$crate::downcast_temporal_array!($values => {$e} $($p $(if $pred)? => $fallback,)*)
319+
};
320+
// Remove $values parentheses.
321+
(($($values:ident),+) => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
322+
$crate::downcast_temporal_array!($($values),+ => $e $($p $(if $pred)? => $fallback,)*)
323+
};
324+
// Turn $e into a block & remove $values parentheses.
325+
(($($values:ident),+) => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
326+
$crate::downcast_temporal_array!($($values),+ => {$e} $($p $(if $pred)? => $fallback,)*)
327+
};
322328
}
323329

324330
/// Given one or more expressions evaluating to primitive [`DataType`] invokes the provided macro
@@ -353,7 +359,7 @@ macro_rules! downcast_temporal_array {
353359
/// [`DataType`]: arrow_schema::DataType
354360
#[macro_export]
355361
macro_rules! downcast_primitive {
356-
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
362+
($($data_type:expr),+ => ($m:path $(, $args:tt)*), $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
357363
$crate::downcast_integer! {
358364
$($data_type),+ => ($m $(, $args)*),
359365
$crate::repeat_pat!($crate::cast::__private::DataType::Float16, $($data_type),+) => {
@@ -401,7 +407,7 @@ macro_rules! downcast_primitive {
401407
_ => {
402408
$crate::downcast_temporal! {
403409
$($data_type),+ => ($m $(, $args)*),
404-
$($p $(if $pred)* => $fallback,)*
410+
$($p $(if $pred)? => $fallback,)*
405411
}
406412
}
407413
}
@@ -450,21 +456,24 @@ macro_rules! downcast_primitive_array_helper {
450456
/// [`DataType`]: arrow_schema::DataType
451457
#[macro_export]
452458
macro_rules! downcast_primitive_array {
453-
($values:ident => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
454-
$crate::downcast_primitive_array!($values => {$e} $($p $(if $pred)* => $fallback)*)
455-
};
456-
(($($values:ident),+) => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
457-
$crate::downcast_primitive_array!($($values),+ => {$e} $($p $(if $pred)* => $fallback)*)
458-
};
459-
($($values:ident),+ => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
460-
$crate::downcast_primitive_array!(($($values),+) => $e $($p $(if $pred)* => $fallback)*)
461-
};
462-
(($($values:ident),+) => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
459+
($($values:ident),+ => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
463460
$crate::downcast_primitive!{
464461
$($values.data_type()),+ => ($crate::downcast_primitive_array_helper, $($values),+, $e),
465-
$($p $(if $pred)* => $fallback,)*
462+
$($p $(if $pred)? => $fallback,)*
466463
}
467464
};
465+
// Turn $e into a block.
466+
($values:ident => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
467+
$crate::downcast_primitive_array!($values => {$e} $($p $(if $pred)? => $fallback,)*)
468+
};
469+
// Remove $values parentheses.
470+
(($($values:ident),+) => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
471+
$crate::downcast_primitive_array!($($values),+ => $e $($p $(if $pred)? => $fallback,)*)
472+
};
473+
// Turn $e into a block & remove $values parentheses.
474+
(($($values:ident),+) => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
475+
$crate::downcast_primitive_array!($($values),+ => {$e} $($p $(if $pred)? => $fallback,)*)
476+
};
468477
}
469478

470479
/// Force downcast of an [`Array`], such as an [`ArrayRef`], to
@@ -546,19 +555,19 @@ macro_rules! downcast_dictionary_array_helper {
546555
/// [`DataType`]: arrow_schema::DataType
547556
#[macro_export]
548557
macro_rules! downcast_dictionary_array {
549-
($values:ident => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
550-
downcast_dictionary_array!($values => {$e} $($p $(if $pred)* => $fallback)*)
558+
($values:ident => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
559+
downcast_dictionary_array!($values => {$e} $($p $(if $pred)? => $fallback,)*)
551560
};
552561

553-
($values:ident => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
562+
($values:ident => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
554563
match $values.data_type() {
555564
$crate::cast::__private::DataType::Dictionary(k, _) => {
556565
$crate::downcast_integer! {
557566
k.as_ref() => ($crate::downcast_dictionary_array_helper, $values, $e),
558567
k => unreachable!("unsupported dictionary key type: {}", k)
559568
}
560569
}
561-
$($p $(if $pred)* => $fallback,)*
570+
$($p $(if $pred)? => $fallback,)*
562571
}
563572
}
564573
}
@@ -654,19 +663,19 @@ macro_rules! downcast_run_array_helper {
654663
/// [`DataType`]: arrow_schema::DataType
655664
#[macro_export]
656665
macro_rules! downcast_run_array {
657-
($values:ident => $e:expr, $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
658-
downcast_run_array!($values => {$e} $($p $(if $pred)* => $fallback)*)
666+
($values:ident => $e:expr, $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
667+
downcast_run_array!($values => {$e} $($p $(if $pred)? => $fallback,)*)
659668
};
660669

661-
($values:ident => $e:block $($p:pat $(if $pred:expr)* => $fallback:expr $(,)*)*) => {
670+
($values:ident => $e:block $($p:pat $(if $pred:expr)? => $fallback:expr $(,)?)*) => {
662671
match $values.data_type() {
663672
$crate::cast::__private::DataType::RunEndEncoded(k, _) => {
664673
$crate::downcast_run_end_index! {
665674
k.data_type() => ($crate::downcast_run_array_helper, $values, $e),
666675
k => unreachable!("unsupported run end index type: {}", k)
667676
}
668677
}
669-
$($p $(if $pred)* => $fallback,)*
678+
$($p $(if $pred)? => $fallback,)*
670679
}
671680
}
672681
}

arrow-json/benches/json_writer.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
use arrow_array::builder::{FixedSizeListBuilder, Int64Builder, ListBuilder};
1919
use arrow_array::{Array, RecordBatch};
2020
use arrow_json::LineDelimitedWriter;
21-
use arrow_schema::{DataType, Field, Schema};
21+
use arrow_schema::{Field, Schema};
2222
use criterion::{Criterion, Throughput, criterion_group, criterion_main};
2323
use std::sync::Arc;
2424

@@ -38,7 +38,7 @@ fn build_list_batch(rows: usize, elements: usize) -> RecordBatch {
3838

3939
let schema = Arc::new(Schema::new(vec![Field::new(
4040
"list",
41-
DataType::List(Arc::new(Field::new_list_field(DataType::Int64, false))),
41+
list_array.data_type().clone(),
4242
false,
4343
)]));
4444

@@ -84,15 +84,15 @@ fn build_fixed_size_list_batch(rows: usize, elements: usize) -> RecordBatch {
8484
}
8585
builder.append(true);
8686
}
87-
let fsl_array = builder.finish();
87+
let list_array = builder.finish();
8888

8989
let schema = Arc::new(Schema::new(vec![Field::new(
9090
"fixed_size_list",
91-
fsl_array.data_type().clone(),
91+
list_array.data_type().clone(),
9292
false,
9393
)]));
9494

95-
RecordBatch::try_new(schema, vec![Arc::new(fsl_array)]).unwrap()
95+
RecordBatch::try_new(schema, vec![Arc::new(list_array)]).unwrap()
9696
}
9797

9898
fn bench_write_fixed_size_list(c: &mut Criterion) {

arrow-select/src/coalesce/byte_view.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,6 @@ impl<B: ByteViewType> InProgressByteViewArray<B> {
101101
if self.views.capacity() == 0 {
102102
self.views.reserve(self.batch_size);
103103
}
104-
debug_assert_eq!(self.views.capacity(), self.batch_size);
105104
}
106105

107106
/// Finishes in progress buffer, if any

arrow-select/src/coalesce/primitive.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ impl<T: ArrowPrimitiveType> InProgressPrimitiveArray<T> {
5858
if self.current.capacity() == 0 {
5959
self.current.reserve(self.batch_size);
6060
}
61-
debug_assert_eq!(self.current.capacity(), self.batch_size);
6261
}
6362
}
6463

parquet-variant-compute/src/unshred_variant.rs

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
2121
use arrow::array::{
2222
Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, FixedSizeListArray,
23-
GenericListArray, GenericListViewArray, ListLikeArray, PrimitiveArray, StringArray,
24-
StructArray,
23+
GenericListArray, GenericListViewArray, LargeStringArray, ListLikeArray, PrimitiveArray,
24+
StringArray, StringViewArray, StructArray,
2525
};
2626
use arrow::buffer::NullBuffer;
2727
use arrow::datatypes::{
@@ -105,6 +105,8 @@ enum UnshredVariantRowBuilder<'a> {
105105
TimestampNanosecond(TimestampUnshredRowBuilder<'a, TimestampNanosecondType>),
106106
PrimitiveBoolean(UnshredPrimitiveRowBuilder<'a, BooleanArray>),
107107
PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>),
108+
PrimitiveStringView(UnshredPrimitiveRowBuilder<'a, StringViewArray>),
109+
PrimitiveLargeString(UnshredPrimitiveRowBuilder<'a, LargeStringArray>),
108110
PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>),
109111
PrimitiveUuid(UnshredPrimitiveRowBuilder<'a, FixedSizeBinaryArray>),
110112
List(ListUnshredVariantBuilder<'a, GenericListArray<i32>>),
@@ -146,6 +148,8 @@ impl<'a> UnshredVariantRowBuilder<'a> {
146148
Self::TimestampNanosecond(b) => b.append_row(builder, metadata, index),
147149
Self::PrimitiveBoolean(b) => b.append_row(builder, metadata, index),
148150
Self::PrimitiveString(b) => b.append_row(builder, metadata, index),
151+
Self::PrimitiveStringView(b) => b.append_row(builder, metadata, index),
152+
Self::PrimitiveLargeString(b) => b.append_row(builder, metadata, index),
149153
Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata, index),
150154
Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index),
151155
Self::List(b) => b.append_row(builder, metadata, index),
@@ -226,6 +230,8 @@ impl<'a> UnshredVariantRowBuilder<'a> {
226230
}
227231
DataType::Boolean => primitive_builder!(PrimitiveBoolean, as_boolean),
228232
DataType::Utf8 => primitive_builder!(PrimitiveString, as_string),
233+
DataType::Utf8View => primitive_builder!(PrimitiveStringView, as_string_view),
234+
DataType::LargeUtf8 => primitive_builder!(PrimitiveLargeString, as_string),
229235
DataType::BinaryView => primitive_builder!(PrimitiveBinaryView, as_binary_view),
230236
DataType::FixedSizeBinary(16) => {
231237
primitive_builder!(PrimitiveUuid, as_fixed_size_binary)
@@ -405,6 +411,8 @@ macro_rules! impl_append_to_variant_builder {
405411

406412
impl_append_to_variant_builder!(BooleanArray);
407413
impl_append_to_variant_builder!(StringArray);
414+
impl_append_to_variant_builder!(StringViewArray);
415+
impl_append_to_variant_builder!(LargeStringArray);
408416
impl_append_to_variant_builder!(BinaryViewArray);
409417
impl_append_to_variant_builder!(PrimitiveArray<Int8Type>);
410418
impl_append_to_variant_builder!(PrimitiveArray<Int16Type>);
@@ -664,5 +672,52 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L> {
664672
}
665673
}
666674

667-
// TODO: This code is covered by tests in `parquet/tests/variant_integration.rs`. Does that suffice?
668-
// Or do we also need targeted stand-alone unit tests for full coverage?
675+
#[cfg(test)]
676+
mod tests {
677+
use crate::VariantArray;
678+
use arrow::array::{BinaryViewArray, LargeStringArray, StringViewArray};
679+
use parquet_variant::Variant;
680+
681+
#[test]
682+
fn test_unshred_utf8view_typed_value() {
683+
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
684+
let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]);
685+
686+
let typed_value: arrow::array::ArrayRef = std::sync::Arc::new(StringViewArray::from(vec![
687+
Some("hello"),
688+
Some("middle"),
689+
Some("world"),
690+
]));
691+
692+
let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None);
693+
694+
let result = crate::unshred_variant(&variant_array).unwrap();
695+
696+
assert_eq!(result.len(), 3);
697+
assert_eq!(result.value(0), Variant::from("hello"));
698+
assert_eq!(result.value(1), Variant::from("middle"));
699+
assert_eq!(result.value(2), Variant::from("world"));
700+
}
701+
702+
#[test]
703+
fn test_unshred_largeutf8_typed_value() {
704+
let metadata_bytes: &[u8] = &[0x01, 0x00, 0x00];
705+
let metadata = BinaryViewArray::from_iter_values(vec![metadata_bytes; 3]);
706+
707+
let typed_value: arrow::array::ArrayRef =
708+
std::sync::Arc::new(LargeStringArray::from(vec![
709+
Some("hello"),
710+
Some("middle"),
711+
Some("world"),
712+
]));
713+
714+
let variant_array = VariantArray::from_parts(metadata, None, Some(typed_value), None);
715+
716+
let result = crate::unshred_variant(&variant_array).unwrap();
717+
718+
assert_eq!(result.len(), 3);
719+
assert_eq!(result.value(0), Variant::from("hello"));
720+
assert_eq!(result.value(1), Variant::from("middle"));
721+
assert_eq!(result.value(2), Variant::from("world"));
722+
}
723+
}

0 commit comments

Comments
 (0)