Skip to content

Commit fe126d0

Browse files
authored
make_decoder accepts borrowed DataType instead of owned (#9270)
# Which issue does this PR close? - Part of #8987 # Rationale for this change Today's json decoder helper, `make_decoder`, takes an owned data type whose components are cloned at every level during the recursive decoder initialization process. This breaks pointer stability of the resulting `DataType` instances that a custom JSON decoder factory would see, vs. those of the schema it and the reader builder were initialized with. The lack of pointer stability prevents users from creating "path based" decoder factories, that are able to customize decoder behavior based not only on type, but also on the field's path in the schema. See the `PathBasedDecoderFactory` in arrow-json/tests/custom_decoder_tests.rs of #9259, for an example. # What changes are included in this PR? By passing `&DataType` instead, we change code like this: ```rust let decoder = make_decoder(field.data_type().clone(), ...); Ok(Self { data_type, decoder, ... }) ``` to this: ```rust let child_decoder = make_decoder(field.data_type(), ...); Ok(Self { data_type: data_type.clone(), decoder, ... }) ``` Result: Every call to `make_decoder` receives a reference to the actual original data type from the builder's input schema. The final decoder `Self` is unchanged -- it already received a clone and continues to do so. NOTE: There is one additional clone of the top-level `DataType::Struct` we create for normal (`!is_field`) builders. But that's a cheap arc clone of a `Fields` member. # Are these changes tested? Yes, existing unit tests validate the change. # Are there any user-facing changes? No. All functions and data types involved are private -- the array decoders are marked `pub` but are defined in a private mod with no public re-export that would make them available outside the crate.
1 parent 4cd2d14 commit fe126d0

File tree

6 files changed

+37
-33
lines changed

6 files changed

+37
-33
lines changed

arrow-json/src/reader/list_array.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,18 @@ pub struct ListArrayDecoder<O> {
3434
impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
3535
pub fn new(
3636
ctx: &DecoderContext,
37-
data_type: DataType,
37+
data_type: &DataType,
3838
is_nullable: bool,
3939
) -> Result<Self, ArrowError> {
40-
let field = match &data_type {
40+
let field = match data_type {
4141
DataType::List(f) if !O::IS_LARGE => f,
4242
DataType::LargeList(f) if O::IS_LARGE => f,
4343
_ => unreachable!(),
4444
};
45-
let decoder = ctx.make_decoder(field.data_type().clone(), field.is_nullable())?;
45+
let decoder = ctx.make_decoder(field.data_type(), field.is_nullable())?;
4646

4747
Ok(Self {
48-
data_type,
48+
data_type: data_type.clone(),
4949
decoder,
5050
phantom: Default::default(),
5151
is_nullable,

arrow-json/src/reader/map_array.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,10 @@ pub struct MapArrayDecoder {
3333
impl MapArrayDecoder {
3434
pub fn new(
3535
ctx: &DecoderContext,
36-
data_type: DataType,
36+
data_type: &DataType,
3737
is_nullable: bool,
3838
) -> Result<Self, ArrowError> {
39-
let fields = match &data_type {
39+
let fields = match data_type {
4040
DataType::Map(_, true) => {
4141
return Err(ArrowError::NotYetImplemented(
4242
"Decoding MapArray with sorted fields".to_string(),
@@ -53,11 +53,11 @@ impl MapArrayDecoder {
5353
_ => unreachable!(),
5454
};
5555

56-
let keys = ctx.make_decoder(fields[0].data_type().clone(), fields[0].is_nullable())?;
57-
let values = ctx.make_decoder(fields[1].data_type().clone(), fields[1].is_nullable())?;
56+
let keys = ctx.make_decoder(fields[0].data_type(), fields[0].is_nullable())?;
57+
let values = ctx.make_decoder(fields[1].data_type(), fields[1].is_nullable())?;
5858

5959
Ok(Self {
60-
data_type,
60+
data_type: data_type.clone(),
6161
keys,
6262
values,
6363
is_nullable,

arrow-json/src/reader/mod.rs

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ use crate::StructMode;
137137
use crate::reader::binary_array::{
138138
BinaryArrayDecoder, BinaryViewDecoder, FixedSizeBinaryArrayDecoder,
139139
};
140+
use std::borrow::Cow;
140141
use std::io::BufRead;
141142
use std::sync::Arc;
142143

@@ -295,20 +296,21 @@ impl ReaderBuilder {
295296

296297
/// Create a [`Decoder`]
297298
pub fn build_decoder(self) -> Result<Decoder, ArrowError> {
298-
let (data_type, nullable) = match self.is_field {
299-
false => (DataType::Struct(self.schema.fields.clone()), false),
300-
true => {
301-
let field = &self.schema.fields[0];
302-
(field.data_type().clone(), field.is_nullable())
303-
}
299+
let (data_type, nullable) = if self.is_field {
300+
let field = &self.schema.fields[0];
301+
let data_type = Cow::Borrowed(field.data_type());
302+
(data_type, field.is_nullable())
303+
} else {
304+
let data_type = Cow::Owned(DataType::Struct(self.schema.fields.clone()));
305+
(data_type, false)
304306
};
305307

306308
let ctx = DecoderContext {
307309
coerce_primitive: self.coerce_primitive,
308310
strict_mode: self.strict_mode,
309311
struct_mode: self.struct_mode,
310312
};
311-
let decoder = ctx.make_decoder(data_type, nullable)?;
313+
let decoder = ctx.make_decoder(data_type.as_ref(), nullable)?;
312314

313315
let num_fields = self.schema.flattened_fields().len();
314316

@@ -713,7 +715,7 @@ impl DecoderContext {
713715
/// implementation.
714716
fn make_decoder(
715717
&self,
716-
data_type: DataType,
718+
data_type: &DataType,
717719
is_nullable: bool,
718720
) -> Result<Box<dyn ArrayDecoder>, ArrowError> {
719721
make_decoder(self, data_type, is_nullable)
@@ -728,12 +730,12 @@ macro_rules! primitive_decoder {
728730

729731
fn make_decoder(
730732
ctx: &DecoderContext,
731-
data_type: DataType,
733+
data_type: &DataType,
732734
is_nullable: bool,
733735
) -> Result<Box<dyn ArrayDecoder>, ArrowError> {
734736
let coerce_primitive = ctx.coerce_primitive();
735737
downcast_integer! {
736-
data_type => (primitive_decoder, data_type),
738+
*data_type => (primitive_decoder, data_type),
737739
DataType::Null => Ok(Box::<NullArrayDecoder>::default()),
738740
DataType::Float16 => primitive_decoder!(Float16Type, data_type),
739741
DataType::Float32 => primitive_decoder!(Float32Type, data_type),
@@ -792,7 +794,7 @@ fn make_decoder(
792794
DataType::FixedSizeBinary(len) => Ok(Box::new(FixedSizeBinaryArrayDecoder::new(len))),
793795
DataType::BinaryView => Ok(Box::new(BinaryViewDecoder::default())),
794796
DataType::Map(_, _) => Ok(Box::new(MapArrayDecoder::new(ctx, data_type, is_nullable)?)),
795-
d => Err(ArrowError::NotYetImplemented(format!("Support for {d} in JSON reader")))
797+
_ => Err(ArrowError::NotYetImplemented(format!("Support for {data_type} in JSON reader")))
796798
}
797799
}
798800

arrow-json/src/reader/primitive_array.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,9 +80,9 @@ pub struct PrimitiveArrayDecoder<P: ArrowPrimitiveType> {
8080
}
8181

8282
impl<P: ArrowPrimitiveType> PrimitiveArrayDecoder<P> {
83-
pub fn new(data_type: DataType) -> Self {
83+
pub fn new(data_type: &DataType) -> Self {
8484
Self {
85-
data_type,
85+
data_type: data_type.clone(),
8686
phantom: Default::default(),
8787
}
8888
}

arrow-json/src/reader/struct_array.rs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -81,28 +81,30 @@ pub struct StructArrayDecoder {
8181
impl StructArrayDecoder {
8282
pub fn new(
8383
ctx: &DecoderContext,
84-
data_type: DataType,
84+
data_type: &DataType,
8585
is_nullable: bool,
8686
) -> Result<Self, ArrowError> {
87-
let struct_mode = ctx.struct_mode();
88-
let fields = struct_fields(&data_type);
89-
90-
// If this struct nullable, need to permit nullability in child array
91-
// StructArrayDecoder::decode verifies that if the child is not nullable
92-
// it doesn't contain any nulls not masked by its parent
87+
let fields = struct_fields(data_type);
9388
let decoders = fields
9489
.iter()
95-
.map(|f| ctx.make_decoder(f.data_type().clone(), f.is_nullable() || is_nullable))
90+
.map(|f| {
91+
// If this struct nullable, need to permit nullability in child array
92+
// StructArrayDecoder::decode verifies that if the child is not nullable
93+
// it doesn't contain any nulls not masked by its parent
94+
let nullable = f.is_nullable() || is_nullable;
95+
ctx.make_decoder(f.data_type(), nullable)
96+
})
9697
.collect::<Result<Vec<_>, ArrowError>>()?;
9798

99+
let struct_mode = ctx.struct_mode();
98100
let field_name_to_index = if struct_mode == StructMode::ObjectOnly {
99101
build_field_index(fields)
100102
} else {
101103
None
102104
};
103105

104106
Ok(Self {
105-
data_type,
107+
data_type: data_type.clone(),
106108
decoders,
107109
strict_mode: ctx.strict_mode(),
108110
is_nullable,

arrow-json/src/reader/timestamp_array.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,9 @@ pub struct TimestampArrayDecoder<P: ArrowTimestampType, Tz: TimeZone> {
3737
}
3838

3939
impl<P: ArrowTimestampType, Tz: TimeZone> TimestampArrayDecoder<P, Tz> {
40-
pub fn new(data_type: DataType, timezone: Tz) -> Self {
40+
pub fn new(data_type: &DataType, timezone: Tz) -> Self {
4141
Self {
42-
data_type,
42+
data_type: data_type.clone(),
4343
timezone,
4444
phantom: Default::default(),
4545
}

0 commit comments

Comments
 (0)