From 5b246df8d2bdebaad38cbd32eff0c0cbf542fc58 Mon Sep 17 00:00:00 2001 From: Mikhail Zabaluev Date: Sun, 1 Feb 2026 03:56:53 +0200 Subject: [PATCH 1/4] fix: allow null default for any unions with null --- arrow-avro/src/codec.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow-avro/src/codec.rs b/arrow-avro/src/codec.rs index d54c6602dad6..5dff5d0018dc 100644 --- a/arrow-avro/src/codec.rs +++ b/arrow-avro/src/codec.rs @@ -307,18 +307,18 @@ impl AvroDataType { } } - // Handle JSON nulls per-spec: allowed only for `null` type or unions with null FIRST + // Handle JSON nulls per-spec: allowed only for `null` type or unions with null if default_json.is_null() { return match self.codec() { Codec::Null => Ok(AvroLiteral::Null), - Codec::Union(encodings, _, _) if !encodings.is_empty() - && matches!(encodings[0].codec(), Codec::Null) => + Codec::Union(encodings, _, _) + if encodings.iter().any(|enc| matches!(enc.codec(), Codec::Null)) => { Ok(AvroLiteral::Null) } - _ if self.nullability() == Some(Nullability::NullFirst) => Ok(AvroLiteral::Null), + _ if self.nullability().is_some() => Ok(AvroLiteral::Null), _ => Err(ArrowError::SchemaError( - "JSON null default is only valid for `null` type or for a union whose first branch is `null`" + "JSON null default is only valid for `null` type or for a union with a `null` branch" .to_string(), )), }; From 665dc6087f1705ce80a9604928ee5458da267b2c Mon Sep 17 00:00:00 2001 From: Mikhail Zabaluev Date: Fri, 27 Feb 2026 12:18:23 +0200 Subject: [PATCH 2/4] test: null default resolution, null as second type Test the Avro 1.12 spec behavior of resolving default values in the specific case when the default value for the field added in the reader schema is null, and null is the second branch of the field's union type. 
--- arrow-avro/src/reader/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs index aa01f272bfeb..6b2568b42eb6 100644 --- a/arrow-avro/src/reader/mod.rs +++ b/arrow-avro/src/reader/mod.rs @@ -2936,6 +2936,7 @@ mod test { },"default":{"x":7}}), serde_json::json!({"name":"d_nullable_null","type":["null","int"],"default":null}), serde_json::json!({"name":"d_nullable_value","type":["int","null"],"default":123}), + serde_json::json!({"name":"d_nullable_null_second","type":["int","null"],"default":null}), ], ); let actual = read_alltypes_with_reader_schema(path, reader_schema); @@ -2943,7 +2944,7 @@ mod test { assert!(num_rows > 0, "skippable_types.avro should contain rows"); assert_eq!( actual.num_columns(), - 22, + 23, "expected exactly our defaulted fields" ); let mut arrays: Vec> = Vec::with_capacity(22); @@ -3070,6 +3071,10 @@ mod test { arrays.push(Arc::new(Int32Array::from_iter_values(std::iter::repeat_n( 123, num_rows, )))); + arrays.push(Arc::new(Int32Array::from_iter(std::iter::repeat_n( + None::, + num_rows, + )))); let expected = RecordBatch::try_new(actual.schema(), arrays).unwrap(); assert_eq!( actual, expected, From bdce846c0e6112e23466f1d8bd6e6bb4b2df6593 Mon Sep 17 00:00:00 2001 From: Mikhail Zabaluev Date: Fri, 27 Feb 2026 13:04:51 +0200 Subject: [PATCH 3/4] test: update nullability rules test Avro 1.12, new rules. 
--- arrow-avro/src/codec.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arrow-avro/src/codec.rs b/arrow-avro/src/codec.rs index 5dff5d0018dc..2864677f20bc 100644 --- a/arrow-avro/src/codec.rs +++ b/arrow-avro/src/codec.rs @@ -2946,12 +2946,9 @@ mod tests { assert_default_stored(&dt_int_nf, &Value::Null); let mut dt_int_ns = AvroDataType::new(Codec::Int32, HashMap::new(), Some(Nullability::NullSecond)); - let err2 = dt_int_ns.parse_and_store_default(&Value::Null).unwrap_err(); - assert!( - err2.to_string() - .contains("JSON null default is only valid for `null` type"), - "unexpected error: {err2}" - ); + let lit3 = dt_int_ns.parse_and_store_default(&Value::Null).unwrap(); + assert_eq!(lit3, AvroLiteral::Null); + assert_default_stored(&dt_int_ns, &Value::Null); } #[test] From b7130ea1484b7d00c4ede3b37cd0045c801137ca Mon Sep 17 00:00:00 2001 From: Mikhail Zabaluev Date: Tue, 7 Apr 2026 23:12:09 +0300 Subject: [PATCH 4/4] feat: avro_1_12 feature Introduce the "avro_1_12" feature flag and use it to guard the behavior of JSON null defaults for union types having null schema in a position other than the first. 
--- arrow-avro/Cargo.toml | 1 + arrow-avro/src/codec.rs | 81 +++++++++++++++++++++++++++++------- arrow-avro/src/reader/mod.rs | 29 ++++++++++++- 3 files changed, 95 insertions(+), 16 deletions(-) diff --git a/arrow-avro/Cargo.toml b/arrow-avro/Cargo.toml index f46ef7e7b999..e89b061dfc4c 100644 --- a/arrow-avro/Cargo.toml +++ b/arrow-avro/Cargo.toml @@ -44,6 +44,7 @@ md5 = ["dep:md5"] sha256 = ["dep:sha2"] small_decimals = [] avro_custom_types = ["dep:arrow-select"] +avro_1_12 = [] # Enable async APIs async = ["futures", "tokio"] diff --git a/arrow-avro/src/codec.rs b/arrow-avro/src/codec.rs index f09f4e0e804d..b441a26796c3 100644 --- a/arrow-avro/src/codec.rs +++ b/arrow-avro/src/codec.rs @@ -251,6 +251,52 @@ impl AvroDataType { self.nullability } + // Returns `Ok` if this data type accepts a JSON null default value, + // according to Avro schema rules prior to spec version 1.12, otherwise + // returns an `Err` with a schema error. + // Prior to 1.12, Avro only allowed default values matching the first branch of a union. + #[cfg(not(feature = "avro_1_12"))] + fn validate_null_default(&self) -> Result<(), ArrowError> { + match self.codec() { + Codec::Null => Ok(()), + Codec::Union(encodings, _, _) + if encodings + .first() + .map_or(false, |enc| matches!(enc.codec(), Codec::Null)) => + { + Ok(()) + } + _ if self.nullability() == Some(Nullability::NullFirst) => Ok(()), + _ => Err(ArrowError::SchemaError( + "JSON null default is only valid for `null` type or for a union whose first branch is `null`" + .to_string(), + )), + } + } + + // Returns `Ok` if this data type accepts a JSON null default value, + // according to Avro schema rules for spec version 1.12 and later, otherwise + // returns an `Err` with a schema error. + // Since 1.12, Avro allows default values matching any branch of a union. 
+ #[cfg(feature = "avro_1_12")] + fn validate_null_default(&self) -> Result<(), ArrowError> { + match self.codec() { + Codec::Null => Ok(()), + Codec::Union(encodings, _, _) + if encodings + .iter() + .any(|enc| matches!(enc.codec(), Codec::Null)) => + { + Ok(()) + } + _ if self.nullability().is_some() => Ok(()), + _ => Err(ArrowError::SchemaError( + "JSON null default is only valid for `null` type or for a union with a `null` branch" + .to_string(), + )), + } + } + #[inline] fn parse_default_literal(&self, default_json: &Value) -> Result { fn expect_string<'v>( @@ -313,21 +359,10 @@ impl AvroDataType { } } - // Handle JSON nulls per-spec: allowed only for `null` type or unions with null + // Handle JSON nulls per the spec rules if default_json.is_null() { - return match self.codec() { - Codec::Null => Ok(AvroLiteral::Null), - Codec::Union(encodings, _, _) - if encodings.iter().any(|enc| matches!(enc.codec(), Codec::Null)) => - { - Ok(AvroLiteral::Null) - } - _ if self.nullability().is_some() => Ok(AvroLiteral::Null), - _ => Err(ArrowError::SchemaError( - "JSON null default is only valid for `null` type or for a union with a `null` branch" - .to_string(), - )), - }; + self.validate_null_default()?; + return Ok(AvroLiteral::Null); } let lit = match self.codec() { Codec::Null => { @@ -3282,6 +3317,24 @@ mod tests { let lit2 = dt_int_nf.parse_and_store_default(&Value::Null).unwrap(); assert_eq!(lit2, AvroLiteral::Null); assert_default_stored(&dt_int_nf, &Value::Null); + } + + #[cfg(not(feature = "avro_1_12"))] + #[test] + fn test_validate_and_store_default_null_and_nullability_rules_avro_1_11() { + let mut dt_int_ns = + AvroDataType::new(Codec::Int32, HashMap::new(), Some(Nullability::NullSecond)); + let err2 = dt_int_ns.parse_and_store_default(&Value::Null).unwrap_err(); + assert!( + err2.to_string() + .contains("JSON null default is only valid for `null` type"), + "unexpected error: {err2}" + ); + } + + #[cfg(feature = "avro_1_12")] + #[test] + fn 
test_validate_and_store_default_null_and_nullability_rules_avro_1_12() { let mut dt_int_ns = AvroDataType::new(Codec::Int32, HashMap::new(), Some(Nullability::NullSecond)); let lit3 = dt_int_ns.parse_and_store_default(&Value::Null).unwrap(); diff --git a/arrow-avro/src/reader/mod.rs b/arrow-avro/src/reader/mod.rs index 1943976861b2..2d0891af28cf 100644 --- a/arrow-avro/src/reader/mod.rs +++ b/arrow-avro/src/reader/mod.rs @@ -2950,7 +2950,6 @@ mod test { },"default":{"x":7}}), serde_json::json!({"name":"d_nullable_null","type":["null","int"],"default":null}), serde_json::json!({"name":"d_nullable_value","type":["int","null"],"default":123}), - serde_json::json!({"name":"d_nullable_null_second","type":["int","null"],"default":null}), ], ); let actual = read_alltypes_with_reader_schema(path, reader_schema); @@ -2958,7 +2957,7 @@ mod test { assert!(num_rows > 0, "skippable_types.avro should contain rows"); assert_eq!( actual.num_columns(), - 23, + 22, "expected exactly our defaulted fields" ); let mut arrays: Vec> = Vec::with_capacity(22); @@ -3085,6 +3084,32 @@ mod test { arrays.push(Arc::new(Int32Array::from_iter_values(std::iter::repeat_n( 123, num_rows, )))); + let expected = RecordBatch::try_new(actual.schema(), arrays).unwrap(); + assert_eq!( + actual, expected, + "defaults should materialize correctly for all fields" + ); + } + + #[cfg(feature = "avro_1_12")] + #[test] + fn test_schema_resolution_defaults_cases_supported_by_avro_1_12() { + let path = "test/data/skippable_types.avro"; + let reader_schema = make_reader_schema_with_default_fields( + path, + vec![ + serde_json::json!({"name":"d_nullable_null_second","type":["int","null"],"default":null}), + ], + ); + let actual = read_alltypes_with_reader_schema(path, reader_schema); + let num_rows = actual.num_rows(); + assert!(num_rows > 0, "skippable_types.avro should contain rows"); + assert_eq!( + actual.num_columns(), + 1, + "expected exactly our defaulted fields" + ); + let mut arrays: Vec> = 
Vec::with_capacity(22); arrays.push(Arc::new(Int32Array::from_iter(std::iter::repeat_n( None::, num_rows,