From 7e9ab93705a02afb8bc0ebd7df754432b70ff28f Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 1 Dec 2025 12:55:48 +0100 Subject: [PATCH 1/6] Correct PartialEq impl for Union Fields --- arrow-schema/src/fields.rs | 51 +++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 93638181d9ae..9ef50b4fd0b1 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -318,7 +318,7 @@ impl<'a> IntoIterator for &'a Fields { } /// A cheaply cloneable, owned collection of [`FieldRef`] and their corresponding type ids -#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] +#[derive(Clone, Eq, Ord, PartialOrd)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[cfg_attr(feature = "serde", serde(transparent))] pub struct UnionFields(Arc<[(i8, FieldRef)]>); @@ -345,6 +345,28 @@ impl std::ops::Index for UnionFields { } } +impl PartialEq for UnionFields { + fn eq(&self, other: &Self) -> bool { + self.len() == other.len() + && self.iter().all(|a| { + other.iter().any(|b| { + a.0 == b.0 + && a.1.is_nullable() == b.1.is_nullable() + && a.1.data_type().equals_datatype(b.1.data_type()) + }) + }) + } +} + +impl std::hash::Hash for UnionFields { + fn hash(&self, state: &mut H) { + let mut v = self.0.iter().collect::>(); + v.sort_by_key(|(id, _)| *id); + + v.hash(state); + } +} + impl UnionFields { /// Create a new [`UnionFields`] with no fields pub fn empty() -> Self { @@ -1039,4 +1061,31 @@ mod tests { assert!(res.is_ok()); assert_eq!(res.unwrap().len(), 3); } + + #[test] + fn test_union_field_equality() { + let ids = vec![0, 1, 2]; + let fields = vec![ + Field::new("a", DataType::Binary, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Int16, true), + ]; + + let u = UnionFields::try_new(ids.clone(), fields.clone()).unwrap(); + assert_eq!(u.clone(), u.clone()); + + let u_rev = + UnionFields::try_new(ids.clone().into_iter().rev(), fields.into_iter().rev()).unwrap(); + assert_eq!(u, u_rev); + + let fields_2 = vec![ + Field::new("a", DataType::Binary, true), + Field::new("b", DataType::Utf8, true), + // everything is the same from `fields` except Field "c" is not nullable + Field::new("c", DataType::Int16, false), + ]; + + let u2 = UnionFields::try_new(ids.clone(), fields_2.clone()).unwrap(); + assert_ne!(u, u2); + } } From 05fba3a8ecb8d622d6d2934648441f21126ab206 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Mon, 1 Dec 2025 13:12:22 +0100 Subject: [PATCH 2/6] Impl hash --- arrow-schema/src/fields.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 9ef50b4fd0b1..f54fe7e6179c 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use std::ops::Deref; use std::sync::Arc; +use std::{hash::Hash, ops::Deref}; use crate::{ArrowError, DataType, Field, FieldRef}; @@ -358,7 +358,7 @@ impl PartialEq for UnionFields { } } -impl std::hash::Hash for UnionFields { +impl Hash for UnionFields { fn hash(&self, state: &mut H) { let mut v = self.0.iter().collect::>(); v.sort_by_key(|(id, _)| *id); From 1805f341b91566f5f318234e2474c185f2b7a539 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Fri, 12 Dec 2025 14:00:18 -0500 Subject: [PATCH 3/6] Check metadata as well --- arrow-schema/src/fields.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index f54fe7e6179c..ffec7d34aa53 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -347,14 +347,7 @@ impl std::ops::Index for UnionFields { impl PartialEq for UnionFields { fn eq(&self, other: &Self) -> bool { - self.len() == other.len() - && self.iter().all(|a| { - other.iter().any(|b| { - a.0 == b.0 - && a.1.is_nullable() == b.1.is_nullable() - && a.1.data_type().equals_datatype(b.1.data_type()) - }) - }) + self.len() == other.len() && self.iter().all(|a| other.iter().any(|b| a == b)) } } From c815709886f49bea43745860efc20cfa47c68560 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:25:57 -0500 Subject: [PATCH 4/6] Sort by type ids --- arrow-schema/src/fields.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index ffec7d34aa53..0539da70dee0 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -417,7 +417,7 @@ impl UnionFields { loop { match (type_ids_iter.next(), fields_iter.next()) { - (None, None) => return Ok(Self(out.into())), + (None, None) => break, (Some(type_id), Some(field)) => { // check type id is non-negative if type_id < 0 { @@ -450,6 +450,11 @@ impl UnionFields { } } } + + // sort by type ids to produce a consistent ordering + out.sort_unstable_by_key(|&(i, _)| i); + + Ok(Self(out.into())) } /// Create a new [`UnionFields`] from a collection of fields with automatically From a05dac14d47dc9b92e3bfe0c2dc8b55e88dab46d Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:39:05 -0500 Subject: [PATCH 5/6] Fix up tests --- arrow-array/src/array/union_array.rs | 12 ++++++++---- arrow-avro/src/reader/record.rs | 21 ++++++++++++++++----- arrow-schema/src/fields.rs | 17 +---------------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 5ba7b947c724..840854d61bd9 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -1721,8 +1721,9 @@ mod tests { .len(7) .buffers(vec![type_ids, value_offsets]) .child_data(vec![ - string_array.into_data(), + // Child arrays must be in sorted order by type ID: 4, 8, 9 int_array.into_data(), + string_array.into_data(), float_array.into_data(), ]) .build() @@ -1833,8 +1834,9 @@ mod tests { .len(7) .buffers(vec![type_ids, value_offsets]) .child_data(vec![ - string_array.into_data(), + // Child arrays must be in sorted order by type ID: 4, 8, 9 int_array.into_data(), + string_array.into_data(), float_array.into_data(), ]) .build() @@ -1862,9 +1864,10 @@ mod tests { ], ) .unwrap(); + // Child arrays must be in sorted order by type ID: 2, 3 let children = vec![ - Arc::new(StringArray::from_iter_values(["a", "b"])) as _, Arc::new(StringArray::from_iter_values(["c", "d"])) as _, + Arc::new(StringArray::from_iter_values(["a", "b"])) as _, ]; let type_ids = vec![3, 3, 2].into(); @@ -1890,9 +1893,10 @@ mod tests { "Invalid argument error: Type Ids values must match one of the field type ids" ); + // Child arrays must be in sorted order by type ID: 2, 3 let children = vec![ - Arc::new(StringArray::from_iter_values(["a", "b"])) as _, Arc::new(StringArray::from_iter_values(["c"])) as _, + Arc::new(StringArray::from_iter_values(["a", "b"])) as _, ]; let type_ids = ScalarBuffer::from(vec![3_i8, 3, 2]); let offsets = Some(vec![0, 1, 0].into()); diff --git a/arrow-avro/src/reader/record.rs b/arrow-avro/src/reader/record.rs index 97cdeed20fc6..1af3c1250abc 100644 --- a/arrow-avro/src/reader/record.rs +++ b/arrow-avro/src/reader/record.rs @@ -4330,8 +4330,17 @@ mod tests { avro_children.push(AvroDataType::new(codec, Default::default(), None)); fields.push(arrow_schema::Field::new(name, dt, true)); } - let union_fields = UnionFields::try_new(type_ids, fields).unwrap(); - let union_codec = Codec::Union(avro_children.into(), union_fields, UnionMode::Dense); + let union_fields = UnionFields::try_new(type_ids.clone(), fields).unwrap(); + + // UnionFields are sorted by type_id, so we need to reorder avro_children to match + let mut sorted_indices: Vec = (0..type_ids.len()).collect(); + sorted_indices.sort_by_key(|&i| type_ids[i]); + let sorted_avro_children: Vec = sorted_indices + .iter() + .map(|&i| avro_children[i].clone()) + .collect(); + + let union_codec = Codec::Union(sorted_avro_children.into(), union_fields, UnionMode::Dense); AvroDataType::new(union_codec, Default::default(), None) } @@ -4396,11 +4405,13 @@ mod tests { vec![42, 7], ); let mut dec = Decoder::try_new(&union_dt).unwrap(); - let r1 = encode_avro_long(0); + // after sorting by type_id, schema order is [string(7), null(42)] + // to encode null, use branch 1; to encode string, use branch 0 + let r1 = encode_avro_long(1); let mut r2 = Vec::new(); - r2.extend_from_slice(&encode_avro_long(1)); + r2.extend_from_slice(&encode_avro_long(0)); r2.extend_from_slice(&encode_avro_bytes(b"abc")); - let r3 = encode_avro_long(0); + let r3 = encode_avro_long(1); dec.decode(&mut AvroCursor::new(&r1)).unwrap(); dec.decode(&mut AvroCursor::new(&r2)).unwrap(); dec.decode(&mut AvroCursor::new(&r3)).unwrap(); diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 0539da70dee0..214a6837c1c9 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -318,7 +318,7 @@ impl<'a> IntoIterator for &'a Fields { } /// A cheaply cloneable, owned collection of [`FieldRef`] and their corresponding type ids -#[derive(Clone, Eq, Ord, PartialOrd)] +#[derive(Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[cfg_attr(feature = "serde", serde(transparent))] pub struct UnionFields(Arc<[(i8, FieldRef)]>); @@ -345,21 +345,6 @@ impl std::ops::Index for UnionFields { } } -impl PartialEq for UnionFields { - fn eq(&self, other: &Self) -> bool { - self.len() == other.len() && self.iter().all(|a| other.iter().any(|b| a == b)) - } -} - -impl Hash for UnionFields { - fn hash(&self, state: &mut H) { - let mut v = self.0.iter().collect::>(); - v.sort_by_key(|(id, _)| *id); - - v.hash(state); - } -} - impl UnionFields { /// Create a new [`UnionFields`] with no fields pub fn empty() -> Self { From 03bc36abdacac1937804414a72fe108a6045d7a1 Mon Sep 17 00:00:00 2001 From: Matthew Kim <38759997+friendlymatthew@users.noreply.github.com> Date: Tue, 24 Mar 2026 11:10:42 -0400 Subject: [PATCH 6/6] sort --- arrow-array/src/array/union_array.rs | 12 +++------ arrow-avro/src/reader/record.rs | 21 ++++----------- arrow-schema/src/fields.rs | 38 +++------------------------- 3 files changed, 12 insertions(+), 59 deletions(-) diff --git a/arrow-array/src/array/union_array.rs b/arrow-array/src/array/union_array.rs index 840854d61bd9..5ba7b947c724 100644 --- a/arrow-array/src/array/union_array.rs +++ b/arrow-array/src/array/union_array.rs @@ -1721,9 +1721,8 @@ mod tests { .len(7) .buffers(vec![type_ids, value_offsets]) .child_data(vec![ - // Child arrays must be in sorted order by type ID: 4, 8, 9 - int_array.into_data(), string_array.into_data(), + int_array.into_data(), float_array.into_data(), ]) .build() @@ -1834,9 +1833,8 @@ mod tests { .len(7) .buffers(vec![type_ids, value_offsets]) .child_data(vec![ - // Child arrays must be in sorted order by type ID: 4, 8, 9 - int_array.into_data(), string_array.into_data(), + int_array.into_data(), float_array.into_data(), ]) .build() @@ -1864,10 +1862,9 @@ mod tests { ], ) .unwrap(); - // Child arrays must be in sorted order by type ID: 2, 3 let children = vec![ - Arc::new(StringArray::from_iter_values(["c", "d"])) as _, Arc::new(StringArray::from_iter_values(["a", "b"])) as _, + Arc::new(StringArray::from_iter_values(["c", "d"])) as _, ]; let type_ids = vec![3, 3, 2].into(); @@ -1893,10 +1890,9 @@ mod tests { "Invalid argument error: Type Ids values must match one of the field type ids" ); - // Child arrays must be in sorted order by type ID: 2, 3 let children = vec![ - Arc::new(StringArray::from_iter_values(["c"])) as _, Arc::new(StringArray::from_iter_values(["a", "b"])) as _, + Arc::new(StringArray::from_iter_values(["c"])) as _, ]; let type_ids = ScalarBuffer::from(vec![3_i8, 3, 2]); let offsets = Some(vec![0, 1, 0].into()); diff --git a/arrow-avro/src/reader/record.rs b/arrow-avro/src/reader/record.rs index 1af3c1250abc..97cdeed20fc6 100644 --- a/arrow-avro/src/reader/record.rs +++ b/arrow-avro/src/reader/record.rs @@ -4330,17 +4330,8 @@ mod tests { avro_children.push(AvroDataType::new(codec, Default::default(), None)); fields.push(arrow_schema::Field::new(name, dt, true)); } - let union_fields = UnionFields::try_new(type_ids.clone(), fields).unwrap(); - - // UnionFields are sorted by type_id, so we need to reorder avro_children to match - let mut sorted_indices: Vec = (0..type_ids.len()).collect(); - sorted_indices.sort_by_key(|&i| type_ids[i]); - let sorted_avro_children: Vec = sorted_indices - .iter() - .map(|&i| avro_children[i].clone()) - .collect(); - - let union_codec = Codec::Union(sorted_avro_children.into(), union_fields, UnionMode::Dense); + let union_fields = UnionFields::try_new(type_ids, fields).unwrap(); + let union_codec = Codec::Union(avro_children.into(), union_fields, UnionMode::Dense); AvroDataType::new(union_codec, Default::default(), None) } @@ -4405,13 +4396,11 @@ mod tests { vec![42, 7], ); let mut dec = Decoder::try_new(&union_dt).unwrap(); - // after sorting by type_id, schema order is [string(7), null(42)] - // to encode null, use branch 1; to encode string, use branch 0 - let r1 = encode_avro_long(1); + let r1 = encode_avro_long(0); let mut r2 = Vec::new(); - r2.extend_from_slice(&encode_avro_long(0)); + r2.extend_from_slice(&encode_avro_long(1)); r2.extend_from_slice(&encode_avro_bytes(b"abc")); - let r3 = encode_avro_long(1); + let r3 = encode_avro_long(0); dec.decode(&mut AvroCursor::new(&r1)).unwrap(); dec.decode(&mut AvroCursor::new(&r2)).unwrap(); dec.decode(&mut AvroCursor::new(&r3)).unwrap(); diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs index 214a6837c1c9..93638181d9ae 100644 --- a/arrow-schema/src/fields.rs +++ b/arrow-schema/src/fields.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. +use std::ops::Deref; use std::sync::Arc; -use std::{hash::Hash, ops::Deref}; use crate::{ArrowError, DataType, Field, FieldRef}; @@ -318,7 +318,7 @@ impl<'a> IntoIterator for &'a Fields { } /// A cheaply cloneable, owned collection of [`FieldRef`] and their corresponding type ids -#[derive(Clone, PartialEq, Eq, Hash, Ord, PartialOrd)] +#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[cfg_attr(feature = "serde", serde(transparent))] pub struct UnionFields(Arc<[(i8, FieldRef)]>); @@ -402,7 +402,7 @@ impl UnionFields { loop { match (type_ids_iter.next(), fields_iter.next()) { - (None, None) => break, + (None, None) => return Ok(Self(out.into())), (Some(type_id), Some(field)) => { // check type id is non-negative if type_id < 0 { @@ -435,11 +435,6 @@ impl UnionFields { } } } - - // sort by type ids to produce a consistent ordering - out.sort_unstable_by_key(|&(i, _)| i); - - Ok(Self(out.into())) } /// Create a new [`UnionFields`] from a collection of fields with automatically @@ -1044,31 +1039,4 @@ mod tests { assert!(res.is_ok()); assert_eq!(res.unwrap().len(), 3); } - - #[test] - fn test_union_field_equality() { - let ids = vec![0, 1, 2]; - let fields = vec![ - Field::new("a", DataType::Binary, true), - Field::new("b", DataType::Utf8, true), - Field::new("c", DataType::Int16, true), - ]; - - let u = UnionFields::try_new(ids.clone(), fields.clone()).unwrap(); - assert_eq!(u.clone(), u.clone()); - - let u_rev = - UnionFields::try_new(ids.clone().into_iter().rev(), fields.into_iter().rev()).unwrap(); - assert_eq!(u, u_rev); - - let fields_2 = vec![ - Field::new("a", DataType::Binary, true), - Field::new("b", DataType::Utf8, true), - // everything is the same from `fields` except Field "c" is not nullable - Field::new("c", DataType::Int16, false), - ]; - - let u2 = UnionFields::try_new(ids.clone(), fields_2.clone()).unwrap(); - assert_ne!(u, u2); - } }