From 641767153398d31e754c2f76b990dd11dee2953b Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 24 Jun 2025 14:54:00 +0200
Subject: [PATCH 01/11] chore: skip order calculation / exponential planning

---
 .../src/equivalence/properties/union.rs       | 40 +++++++++++++++++--
 1 file changed, 36 insertions(+), 4 deletions(-)
diff --git a/datafusion/physical-expr/src/equivalence/properties/union.rs b/datafusion/physical-expr/src/equivalence/properties/union.rs
index d77129472a8ba..702688011668d 100644
--- a/datafusion/physical-expr/src/equivalence/properties/union.rs
+++ b/datafusion/physical-expr/src/equivalence/properties/union.rs
@@ -67,16 +67,43 @@ fn calculate_union_binary(
         })
         .collect::<Vec<_>>();
 
+    // TEMP HACK WORKAROUND
+    // Revert code from https://github.com/apache/datafusion/pull/12562
+    // Context: https://github.com/apache/datafusion/issues/13748
+    // Context: https://github.com/influxdata/influxdb_iox/issues/13038
+
     // Next, calculate valid orderings for the union by searching for prefixes
     // in both sides.
-    let mut orderings = UnionEquivalentOrderingBuilder::new();
-    orderings.add_satisfied_orderings(&lhs, &rhs)?;
-    orderings.add_satisfied_orderings(&rhs, &lhs)?;
-    let orderings = orderings.build();
+    let mut orderings = vec![];
+    for ordering in lhs.normalized_oeq_class().into_iter() {
+        let mut ordering: Vec<PhysicalSortExpr> = ordering.into();
+
+        // Progressively shorten the ordering to search for a satisfied prefix:
+        while !rhs.ordering_satisfy(ordering.clone())? {
+            ordering.pop();
+        }
+        // There is a non-trivial satisfied prefix, add it as a valid ordering:
+        if !ordering.is_empty() {
+            orderings.push(ordering);
+        }
+    }
 
+    for ordering in rhs.normalized_oeq_class().into_iter() {
+        let mut ordering: Vec<PhysicalSortExpr> = ordering.into();
+
+        // Progressively shorten the ordering to search for a satisfied prefix:
+        while !lhs.ordering_satisfy(ordering.clone())? {
+            ordering.pop();
+        }
+        // There is a non-trivial satisfied prefix, add it as a valid ordering:
+        if !ordering.is_empty() {
+            orderings.push(ordering);
+        }
+    }
     let mut eq_properties = EquivalenceProperties::new(lhs.schema);
     eq_properties.add_constants(constants)?;
     eq_properties.add_orderings(orderings);
+
     Ok(eq_properties)
 }
 
@@ -122,6 +149,7 @@ struct UnionEquivalentOrderingBuilder {
     orderings: Vec<LexOrdering>,
 }
 
+#[expect(unused)]
 impl UnionEquivalentOrderingBuilder {
     fn new() -> Self {
         Self { orderings: vec![] }
@@ -504,6 +532,7 @@ mod tests {
     }
 
     #[test]
+    #[ignore = "InfluxData patch: chore: skip order calculation / exponential planning"]
     fn test_union_equivalence_properties_constants_fill_gaps() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
@@ -579,6 +608,7 @@ mod tests {
     }
 
     #[test]
+    #[ignore = "InfluxData patch: chore: skip order calculation / exponential planning"]
     fn test_union_equivalence_properties_constants_fill_gaps_non_symmetric() -> Result<()>
     {
         let schema = create_test_schema().unwrap();
@@ -607,6 +637,7 @@ mod tests {
     }
 
     #[test]
+    #[ignore = "InfluxData patch: chore: skip order calculation / exponential planning"]
     fn test_union_equivalence_properties_constants_gap_fill_symmetric() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
@@ -658,6 +689,7 @@ mod tests {
     }
 
     #[test]
+    #[ignore = "InfluxData patch: chore: skip order calculation / exponential planning"]
     fn test_union_equivalence_properties_constants_middle_desc() -> Result<()> {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)

From 4277f8fe0cb7b5527a6a6ba35efdd63104fd66ca Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Tue, 16 Jul 2024 12:14:19 -0400
Subject: [PATCH 02/11] (New) Test + workaround for SanityCheck plan

---
 datafusion/physical-optimizer/src/sanity_checker.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/datafusion/physical-optimizer/src/sanity_checker.rs b/datafusion/physical-optimizer/src/sanity_checker.rs
index bff33a281556d..8eb00327143dc 100644
--- a/datafusion/physical-optimizer/src/sanity_checker.rs
+++ b/datafusion/physical-optimizer/src/sanity_checker.rs
@@ -32,6 +32,8 @@ use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_physical_expr::intervals::utils::{check_support, is_datatype_supported};
 use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType};
 use datafusion_physical_plan::joins::SymmetricHashJoinExec;
+use datafusion_physical_plan::sorts::sort::SortExec;
+use datafusion_physical_plan::union::UnionExec;
 use datafusion_physical_plan::{ExecutionPlanProperties, get_plan_string};
 
 use crate::PhysicalOptimizerRule;
@@ -136,6 +138,14 @@ pub fn check_plan_sanity(
         plan.required_input_ordering(),
         plan.required_input_distribution(),
     ) {
+        // TEMP HACK WORKAROUND https://github.com/apache/datafusion/issues/11492
+        if child.as_any().downcast_ref::<UnionExec>().is_some() {
+            continue;
+        }
+        if child.as_any().downcast_ref::<SortExec>().is_some() {
+            continue;
+        }
+
         let child_eq_props = child.equivalence_properties();
         if let Some(sort_req) = sort_req {
             let sort_req = sort_req.into_single();

From 64dc11c50a50e0dde6f39c5fb6d130344ea183bf Mon Sep 17 00:00:00 2001
From: Adam Curtis <adam.curtis.dev@gmail.com>
Date: Tue, 21 Oct 2025 13:33:16 -0400
Subject: [PATCH 03/11] chore: add debug logging and skip error on physical
 schema check

---
 datafusion/core/src/physical_planner.rs | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index cc7d534776d7e..b006d8ae39a9c 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -802,10 +802,17 @@ impl DefaultPhysicalPlanner {
                             ));
                         }
                     }
-                    return internal_err!(
+                    debug!(
                         "Physical input schema should be the same as the one converted from logical input schema. Differences: {}",
                         differences.iter().map(|s| format!("\n\t- {s}")).join("")
                     );
+
+                    //influx: temporarily remove error and only log so that we can find a
+                    //reproducer in production
+                    // return internal_err!("Physical input schema should be the same as the one converted from logical input schema. Differences: {}", differences
+                    //     .iter()
+                    //     .map(|s| format!("\n\t- {s}"))
+                    //     .join(""));
                 }
 
                 let groups = self.create_grouping_physical_expr(
@@ -4207,6 +4214,8 @@ digraph {
     }
 
     #[tokio::test]
+    // Ignored due to disabling the physical schema check skip.
+    #[ignore]
     async fn test_aggregate_schema_mismatch_metadata() {
         let logical_schema =
             Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
@@ -4227,6 +4236,8 @@ digraph {
     }
 
     #[tokio::test]
+    // Ignored due to disabling the physical schema check skip.
+    #[ignore]
     async fn test_aggregate_schema_mismatch_field_count() {
         let logical_schema =
             Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
@@ -4247,6 +4258,8 @@ digraph {
     }
 
     #[tokio::test]
+    // Ignored due to disabling the physical schema check skip.
+    #[ignore]
     async fn test_aggregate_schema_mismatch_field_name() {
         let logical_schema =
             Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
@@ -4268,6 +4281,8 @@ digraph {
     }
 
     #[tokio::test]
+    // Ignored due to disabling the physical schema check skip.
+    #[ignore]
     async fn test_aggregate_schema_mismatch_field_type() {
         let logical_schema =
             Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
@@ -4286,6 +4301,8 @@ digraph {
     }
 
     #[tokio::test]
+    // Ignored due to disabling the physical schema check skip.
+    #[ignore]
     async fn test_aggregate_schema_mismatch_field_nullability() {
         let logical_schema =
             Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
@@ -4304,6 +4321,8 @@ digraph {
     }
 
     #[tokio::test]
+    // Ignored due to disabling the physical schema check skip.
+    #[ignore]
     async fn test_aggregate_schema_mismatch_field_metadata() {
         let logical_schema =
             Arc::new(Schema::new(vec![Field::new("c1", DataType::Int32, false)]));
@@ -4324,6 +4343,8 @@ digraph {
     }
 
     #[tokio::test]
+    // Ignored due to disabling the physical schema check skip.
+    #[ignore]
     async fn test_aggregate_schema_mismatch_multiple() {
         let logical_schema = Arc::new(Schema::new(vec![
             Field::new("c1", DataType::Int32, false),

From 506dd6de584193f787018548a806178941207e94 Mon Sep 17 00:00:00 2001
From: Adam Curtis <adam.curtis.dev@gmail.com>
Date: Tue, 16 Dec 2025 12:21:06 -0500
Subject: [PATCH 04/11] fix: wrap join operators with cooperative() for
 cancellation support

---
 datafusion/physical-plan/src/joins/cross_join.rs         | 9 +++++----
 datafusion/physical-plan/src/joins/hash_join/exec.rs     | 5 +++--
 datafusion/physical-plan/src/joins/nested_loop_join.rs   | 5 +++--
 .../physical-plan/src/joins/sort_merge_join/exec.rs      | 5 +++--
 .../physical-plan/src/joins/symmetric_hash_join.rs       | 9 +++++----
 5 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs
index 4f32b6176ec39..1774a8ea95e53 100644
--- a/datafusion/physical-plan/src/joins/cross_join.rs
+++ b/datafusion/physical-plan/src/joins/cross_join.rs
@@ -25,6 +25,7 @@ use super::utils::{
     OnceAsync, OnceFut, StatefulStreamResult, adjust_right_output_partitioning,
     reorder_output_after_swap,
 };
+use crate::coop::cooperative;
 use crate::execution_plan::{EmissionType, boundedness_from_children};
 use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
 use crate::projection::{
@@ -332,7 +333,7 @@ impl ExecutionPlan for CrossJoinExec {
         })?;
 
         if enforce_batch_size_in_joins {
-            Ok(Box::pin(CrossJoinStream {
+            Ok(Box::pin(cooperative(CrossJoinStream {
                 schema: Arc::clone(&self.schema),
                 left_fut,
                 right: stream,
@@ -341,9 +342,9 @@ impl ExecutionPlan for CrossJoinExec {
                 state: CrossJoinStreamState::WaitBuildSide,
                 left_data: RecordBatch::new_empty(self.left().schema()),
                 batch_transformer: BatchSplitter::new(batch_size),
-            }))
+            })))
         } else {
-            Ok(Box::pin(CrossJoinStream {
+            Ok(Box::pin(cooperative(CrossJoinStream {
                 schema: Arc::clone(&self.schema),
                 left_fut,
                 right: stream,
@@ -352,7 +353,7 @@ impl ExecutionPlan for CrossJoinExec {
                 state: CrossJoinStreamState::WaitBuildSide,
                 left_data: RecordBatch::new_empty(self.left().schema()),
                 batch_transformer: NoopBatchTransformer::new(),
-            }))
+            })))
         }
     }
 
diff --git a/datafusion/physical-plan/src/joins/hash_join/exec.rs b/datafusion/physical-plan/src/joins/hash_join/exec.rs
index 91fc1ee4436ee..0a45887eff3e4 100644
--- a/datafusion/physical-plan/src/joins/hash_join/exec.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/exec.rs
@@ -22,6 +22,7 @@ use std::sync::{Arc, OnceLock};
 use std::{any::Any, vec};
 
 use crate::ExecutionPlanProperties;
+use crate::coop::cooperative;
 use crate::execution_plan::{EmissionType, boundedness_from_children};
 use crate::filter_pushdown::{
     ChildPushdownResult, FilterDescription, FilterPushdownPhase,
@@ -1061,7 +1062,7 @@ impl ExecutionPlan for HashJoinExec {
             .map(|(_, right_expr)| Arc::clone(right_expr))
             .collect::<Vec<_>>();
 
-        Ok(Box::pin(HashJoinStream::new(
+        Ok(Box::pin(cooperative(HashJoinStream::new(
             partition,
             self.schema(),
             on_right,
@@ -1079,7 +1080,7 @@ impl ExecutionPlan for HashJoinExec {
             self.right.output_ordering().is_some(),
             build_accumulator,
             self.mode,
-        )))
+        ))))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs
index 44637321a7e35..fa435c0d815b1 100644
--- a/datafusion/physical-plan/src/joins/nested_loop_join.rs
+++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs
@@ -29,6 +29,7 @@ use super::utils::{
     reorder_output_after_swap, swap_join_projection,
 };
 use crate::common::can_project;
+use crate::coop::cooperative;
 use crate::execution_plan::{EmissionType, boundedness_from_children};
 use crate::joins::SharedBitmapBuilder;
 use crate::joins::utils::{
@@ -529,7 +530,7 @@ impl ExecutionPlan for NestedLoopJoinExec {
             None => self.column_indices.clone(),
         };
 
-        Ok(Box::pin(NestedLoopJoinStream::new(
+        Ok(Box::pin(cooperative(NestedLoopJoinStream::new(
             self.schema(),
             self.filter.clone(),
             self.join_type,
@@ -538,7 +539,7 @@ impl ExecutionPlan for NestedLoopJoinExec {
             column_indices_after_projection,
             metrics,
             batch_size,
-        )))
+        ))))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
diff --git a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
index ae7a5fa764bcc..82929107806de 100644
--- a/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
+++ b/datafusion/physical-plan/src/joins/sort_merge_join/exec.rs
@@ -23,6 +23,7 @@ use std::any::Any;
 use std::fmt::Formatter;
 use std::sync::Arc;
 
+use crate::coop::cooperative;
 use crate::execution_plan::{EmissionType, boundedness_from_children};
 use crate::expressions::PhysicalSortExpr;
 use crate::joins::sort_merge_join::metrics::SortMergeJoinMetrics;
@@ -497,7 +498,7 @@ impl ExecutionPlan for SortMergeJoinExec {
             .register(context.memory_pool());
 
         // create join stream
-        Ok(Box::pin(SortMergeJoinStream::try_new(
+        Ok(Box::pin(cooperative(SortMergeJoinStream::try_new(
             context.session_config().spill_compression(),
             Arc::clone(&self.schema),
             self.sort_options.clone(),
@@ -512,7 +513,7 @@ impl ExecutionPlan for SortMergeJoinExec {
             SortMergeJoinMetrics::new(partition, &self.metrics),
             reservation,
             context.runtime_env(),
-        )?))
+        )?)))
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
index 1f6bc703a0300..a54f930114c60 100644
--- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
+++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
@@ -33,6 +33,7 @@ use std::task::{Context, Poll};
 use std::vec;
 
 use crate::common::SharedMemoryReservation;
+use crate::coop::cooperative;
 use crate::execution_plan::{boundedness_from_children, emission_type_from_children};
 use crate::joins::stream_join_utils::{
     PruningJoinHashMap, SortedFilterExpr, StreamJoinMetrics,
@@ -534,7 +535,7 @@ impl ExecutionPlan for SymmetricHashJoinExec {
         }
 
         if enforce_batch_size_in_joins {
-            Ok(Box::pin(SymmetricHashJoinStream {
+            Ok(Box::pin(cooperative(SymmetricHashJoinStream {
                 left_stream,
                 right_stream,
                 schema: self.schema(),
@@ -552,9 +553,9 @@ impl ExecutionPlan for SymmetricHashJoinExec {
                 state: SHJStreamState::PullRight,
                 reservation,
                 batch_transformer: BatchSplitter::new(batch_size),
-            }))
+            })))
         } else {
-            Ok(Box::pin(SymmetricHashJoinStream {
+            Ok(Box::pin(cooperative(SymmetricHashJoinStream {
                 left_stream,
                 right_stream,
                 schema: self.schema(),
@@ -572,7 +573,7 @@ impl ExecutionPlan for SymmetricHashJoinExec {
                 state: SHJStreamState::PullRight,
                 reservation,
                 batch_transformer: NoopBatchTransformer::new(),
-            }))
+            })))
         }
     }
 

From d615ad91a7f75a8f34eedae60cec4458ac8d277d Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Wed, 4 Feb 2026 08:44:14 -0500
Subject: [PATCH 05/11] Update bytes to v1.11.1 to avoid security audit

---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5ab0b8c84a563..e9167c639a774 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1225,9 +1225,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "bytes"
-version = "1.11.0"
+version = "1.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
 
 [[package]]
 name = "bytes-utils"

From d87aafd4122b632ff8c8f2a07e6f6f087ce73f6d Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Fri, 6 Feb 2026 14:13:02 -0500
Subject: [PATCH 06/11] chore: Update time to avoid rustsec error

---
 Cargo.lock | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e9167c639a774..c2e9312a24723 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4193,9 +4193,9 @@ dependencies = [
 
 [[package]]
 name = "num-conv"
-version = "0.1.0"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050"
 
 [[package]]
 name = "num-integer"
@@ -6044,30 +6044,30 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.44"
+version = "0.3.47"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
 dependencies = [
  "deranged",
  "itoa",
  "num-conv",
  "powerfmt",
- "serde",
+ "serde_core",
  "time-core",
  "time-macros",
 ]
 
 [[package]]
 name = "time-core"
-version = "0.1.6"
+version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
 
 [[package]]
 name = "time-macros"
-version = "0.2.24"
+version = "0.2.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
 dependencies = [
  "num-conv",
  "time-core",

From a21b158cb073c0dffd8475e0e359af9f514a355b Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Mon, 9 Feb 2026 14:57:20 -0500
Subject: [PATCH 07/11] Fix incorrect SortExec removal before AggregateExec

---
 .../src/enforce_sorting/sort_pushdown.rs      |  74 +++++++
 .../sqllogictest/test_files/sort_pushdown.slt | 184 ++++++++++++++++++
 2 files changed, 258 insertions(+)

diff --git a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs
index 698fdea8e766e..267faeda0c1bb 100644
--- a/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs
+++ b/datafusion/physical-optimizer/src/enforce_sorting/sort_pushdown.rs
@@ -35,6 +35,7 @@ use datafusion_physical_expr_common::sort_expr::{
     LexOrdering, LexRequirement, OrderingRequirements, PhysicalSortExpr,
     PhysicalSortRequirement,
 };
+use datafusion_physical_plan::aggregates::AggregateExec;
 use datafusion_physical_plan::execution_plan::CardinalityEffect;
 use datafusion_physical_plan::filter::FilterExec;
 use datafusion_physical_plan::joins::utils::{
@@ -353,6 +354,8 @@ fn pushdown_requirement_to_children(
                 Ok(None)
             }
         }
+    } else if let Some(aggregate_exec) = plan.as_any().downcast_ref::<AggregateExec>() {
+        handle_aggregate_pushdown(aggregate_exec, parent_required)
     } else if maintains_input_order.is_empty()
         || !maintains_input_order.iter().any(|o| *o)
         || plan.as_any().is::<RepartitionExec>()
@@ -388,6 +391,77 @@ fn pushdown_requirement_to_children(
     // TODO: Add support for Projection push down
 }
 
+/// Try to push sorting through  [`AggregateExec`]
+///
+/// `AggregateExec` only preserves the input order of its group by columns
+/// (not aggregates in general, which are formed from arbitrary expressions over
+/// input)
+///
+/// Thus function rewrites the parent required ordering in terms of the
+/// aggregate input if possible. This rewritten requirement represents the
+/// ordering of the `AggregateExec`'s **input** that would also satisfy the
+/// **parent** ordering.
+///
+/// If no such mapping is possible (e.g. because the sort references aggregate
+/// columns), returns None.
+fn handle_aggregate_pushdown(
+    aggregate_exec: &AggregateExec,
+    parent_required: OrderingRequirements,
+) -> Result<Option<Vec<Option<OrderingRequirements>>>> {
+    if !aggregate_exec
+        .maintains_input_order()
+        .into_iter()
+        .any(|o| o)
+    {
+        return Ok(None);
+    }
+
+    let group_expr = aggregate_exec.group_expr();
+    // GROUPING SETS introduce additional output columns and NULL substitutions;
+    // skip pushdown until we can map those cases safely.
+    if group_expr.has_grouping_set() {
+        return Ok(None);
+    }
+
+    let group_input_exprs = group_expr.input_exprs();
+    let parent_requirement = parent_required.into_single();
+    let mut child_requirement = Vec::with_capacity(parent_requirement.len());
+
+    for req in parent_requirement {
+        // Sort above AggregateExec should reference its output columns. Map each
+        // output group-by column to its original input expression.
+        let Some(column) = req.expr.as_any().downcast_ref::<Column>() else {
+            return Ok(None);
+        };
+        if column.index() >= group_input_exprs.len() {
+            // AggregateExec does not produce output that is sorted on aggregate
+            // columns so those can not be pushed through.
+            return Ok(None);
+        }
+        child_requirement.push(PhysicalSortRequirement::new(
+            Arc::clone(&group_input_exprs[column.index()]),
+            req.options,
+        ));
+    }
+
+    let Some(child_requirement) = LexRequirement::new(child_requirement) else {
+        return Ok(None);
+    };
+
+    // Keep sort above aggregate unless input ordering already satisfies the
+    // mapped requirement.
+    if aggregate_exec
+        .input()
+        .equivalence_properties()
+        .ordering_satisfy_requirement(child_requirement.iter().cloned())?
+    {
+        let child_requirements = OrderingRequirements::new(child_requirement);
+        Ok(Some(vec![Some(child_requirements)]))
+    } else {
+        Ok(None)
+    }
+}
+
 /// Return true if pushing the sort requirements through a node would violate
 /// the input sorting requirements for the plan
 fn pushdown_would_violate_requirements(
diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt
index 58d9915a24be2..4148de1d62f73 100644
--- a/datafusion/sqllogictest/test_files/sort_pushdown.slt
+++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt
@@ -851,6 +851,184 @@ LIMIT 3;
 5 4
 2 -3
 
+# Test 3.7: Aggregate ORDER BY expression should keep SortExec
+# Source pattern declared on parquet scan: [x ASC, y ASC].
+# Requested pattern in ORDER BY: [x ASC, CAST(y AS BIGINT) % 2 ASC].
+# Example for x=1 input y order 1,2,3 gives bucket order 1,0,1, which does not
+# match requested bucket ASC order. SortExec is required above AggregateExec.
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+statement ok
+CREATE TABLE agg_expr_data(x INT, y INT, v INT) AS VALUES
+(1, 1, 10),
+(1, 2, 20),
+(1, 3, 30),
+(2, 1, 40),
+(2, 2, 50),
+(2, 3, 60);
+
+query I
+COPY (SELECT * FROM agg_expr_data ORDER BY x, y)
+TO 'test_files/scratch/sort_pushdown/agg_expr_sorted.parquet';
+----
+6
+
+statement ok
+CREATE EXTERNAL TABLE agg_expr_parquet(x INT, y INT, v INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/agg_expr_sorted.parquet'
+WITH ORDER (x ASC, y ASC);
+
+query TT
+EXPLAIN SELECT
+  x,
+  CAST(y AS BIGINT) % 2,
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT) % 2
+ORDER BY x, CAST(y AS BIGINT) % 2;
+----
+logical_plan
+01)Sort: agg_expr_parquet.x ASC NULLS LAST, agg_expr_parquet.y % Int64(2) ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x, CAST(agg_expr_parquet.y AS Int64) % Int64(2)]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, y, v]
+physical_plan
+01)SortExec: expr=[x@0 ASC NULLS LAST, agg_expr_parquet.y % Int64(2)@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[x@0 as x, CAST(y@1 AS Int64) % 2 as agg_expr_parquet.y % Int64(2)], aggr=[sum(agg_expr_parquet.v)], ordering_mode=PartiallySorted([0])
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, y, v], output_ordering=[x@0 ASC NULLS LAST, y@1 ASC NULLS LAST], file_type=parquet
+
+# Expected output pattern from ORDER BY [x, bucket]:
+# rows grouped by x, and within each x bucket appears as 0 then 1.
+query III
+SELECT
+  x,
+  CAST(y AS BIGINT) % 2,
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT) % 2
+ORDER BY x, CAST(y AS BIGINT) % 2;
+----
+1 0 20
+1 1 40
+2 0 50
+2 1 100
+
+# Test 3.8: Aggregate ORDER BY monotonic expression can push down (no SortExec)
+query TT
+EXPLAIN SELECT
+  x,
+  CAST(y AS BIGINT),
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT)
+ORDER BY x, CAST(y AS BIGINT);
+----
+logical_plan
+01)Sort: agg_expr_parquet.x ASC NULLS LAST, agg_expr_parquet.y ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x, CAST(agg_expr_parquet.y AS Int64)]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, y, v]
+physical_plan
+01)AggregateExec: mode=Single, gby=[x@0 as x, CAST(y@1 AS Int64) as agg_expr_parquet.y], aggr=[sum(agg_expr_parquet.v)], ordering_mode=Sorted
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, y, v], output_ordering=[x@0 ASC NULLS LAST, y@1 ASC NULLS LAST], file_type=parquet
+
+query III
+SELECT
+  x,
+  CAST(y AS BIGINT),
+  SUM(v)
+FROM agg_expr_parquet
+GROUP BY x, CAST(y AS BIGINT)
+ORDER BY x, CAST(y AS BIGINT);
+----
+1 1 10
+1 2 20
+1 3 30
+2 1 40
+2 2 50
+2 3 60
+
+# Test 3.9: Aggregate ORDER BY aggregate output should keep SortExec
+query TT
+EXPLAIN SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY SUM(v);
+----
+logical_plan
+01)Sort: sum(agg_expr_parquet.v) ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, v]
+physical_plan
+01)SortExec: expr=[sum(agg_expr_parquet.v)@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[x@0 as x], aggr=[sum(agg_expr_parquet.v)], ordering_mode=Sorted
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, v], output_ordering=[x@0 ASC NULLS LAST], file_type=parquet
+
+query II
+SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY SUM(v);
+----
+1 60
+2 150
+
+# Test 3.10: Aggregate with non-preserved input order should keep SortExec
+# v is not part of the order by
+query TT
+EXPLAIN SELECT v, SUM(y)
+FROM agg_expr_parquet
+GROUP BY v
+ORDER BY v;
+----
+logical_plan
+01)Sort: agg_expr_parquet.v ASC NULLS LAST
+02)--Aggregate: groupBy=[[agg_expr_parquet.v]], aggr=[[sum(CAST(agg_expr_parquet.y AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[y, v]
+physical_plan
+01)SortExec: expr=[v@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[v@1 as v], aggr=[sum(agg_expr_parquet.y)]
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[y, v], file_type=parquet
+
+query II
+SELECT v, SUM(y)
+FROM agg_expr_parquet
+GROUP BY v
+ORDER BY v;
+----
+10 1
+20 2
+30 3
+40 1
+50 2
+60 3
+
+# Test 3.11: Aggregate ORDER BY non-column expression (unsatisfied) keeps SortExec
+# (though note in theory DataFusion could figure out that data sorted by x will also be sorted by x+1)
+query TT
+EXPLAIN SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY x + 1 DESC;
+----
+logical_plan
+01)Sort: CAST(agg_expr_parquet.x AS Int64) + Int64(1) DESC NULLS FIRST
+02)--Aggregate: groupBy=[[agg_expr_parquet.x]], aggr=[[sum(CAST(agg_expr_parquet.v AS Int64))]]
+03)----TableScan: agg_expr_parquet projection=[x, v]
+physical_plan
+01)SortExec: expr=[CAST(x@0 AS Int64) + 1 DESC], preserve_partitioning=[false]
+02)--AggregateExec: mode=Single, gby=[x@0 as x], aggr=[sum(agg_expr_parquet.v)], ordering_mode=Sorted
+03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/agg_expr_sorted.parquet]]}, projection=[x, v], output_ordering=[x@0 ASC NULLS LAST], file_type=parquet
+
+query II
+SELECT x, SUM(v)
+FROM agg_expr_parquet
+GROUP BY x
+ORDER BY x + 1 DESC;
+----
+2 150
+1 60
+
 # Cleanup
 statement ok
 DROP TABLE timestamp_data;
@@ -882,5 +1060,11 @@ DROP TABLE signed_data;
 statement ok
 DROP TABLE signed_parquet;
 
+statement ok
+DROP TABLE agg_expr_data;
+
+statement ok
+DROP TABLE agg_expr_parquet;
+
 statement ok
 SET datafusion.optimizer.enable_sort_pushdown = true;

From b62786424824e3f87230f8fd1923937199802e7a Mon Sep 17 00:00:00 2001
From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
Date: Sat, 14 Feb 2026 14:04:01 -0500
Subject: [PATCH 08/11] fix: validate inter-file ordering in eq_properties()
 (#20329)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Discovered this bug while working on #19724.

TLDR: just because the files themselves are sorted doesn't mean the
partition streams are sorted.

- **`eq_properties()` in `FileScanConfig` blindly trusted
`output_ordering`** (set from Parquet `sorting_columns` metadata)
without verifying that files within a group are in the correct
inter-file order
- `EnforceSorting` then removed `SortExec` based on this unvalidated
ordering, producing **wrong results** when filesystem order didn't match
data order
- Added `validated_output_ordering()` that filters orderings using
`MinMaxStatistics::new_from_files()` + `is_sorted()` to verify
inter-file sort order before reporting them to the optimizer

- Added `validated_output_ordering()` method on `FileScanConfig` that
validates each output ordering against actual file group statistics
- Changed `eq_properties()` to call `self.validated_output_ordering()`
instead of `self.output_ordering.clone()`

Added 8 new regression tests (Tests 4-11):

| Test | Scenario | Key assertion |
|------|----------|---------------|
| **4** | Reversed filesystem order (inferred ordering) | SortExec
retained — wrong inter-file order detected |
| **5** | Overlapping file ranges (inferred ordering) | SortExec
retained — overlapping ranges detected |
| **6** | `WITH ORDER` + reversed filesystem order | SortExec retained
despite explicit ordering |
| **7** | Correctly ordered multi-file group (positive) | SortExec
eliminated — validation passes |
| **8** | DESC ordering with wrong inter-file DESC order | SortExec
retained for DESC direction |
| **9** | Multi-column sort key (overlapping vs non-overlapping) |
Conservative rejection with overlapping stats; passes with clean
boundaries |
| **10** | Correctly ordered + `WITH ORDER` (positive) | SortExec
eliminated — both ordering and stats agree |
| **11** | Multiple partitions (one file per group) |
`SortPreservingMergeExec` merges; no per-partition sort needed |

- [x] `cargo test --test sqllogictests -- sort_pushdown` — all new +
existing tests pass
- [x] `cargo test -p datafusion-datasource` — 97 unit tests + 6 doc
tests pass
- [x] Existing Test 1 (single-file sort pushdown with `WITH ORDER`)
still eliminates SortExec (no regression)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../partition_statistics.rs                   |   2 +-
 datafusion/datasource/src/file_scan_config.rs | 162 ++++--
 datafusion/datasource/src/statistics.rs       |   3 +-
 .../test_files/parquet_sorted_statistics.slt  |   2 +-
 .../sqllogictest/test_files/sort_pushdown.slt | 539 ++++++++++++++++++
 5 files changed, 661 insertions(+), 47 deletions(-)

diff --git a/datafusion/core/tests/physical_optimizer/partition_statistics.rs b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
index ba53d079e3059..36fa95aa9f57d 100644
--- a/datafusion/core/tests/physical_optimizer/partition_statistics.rs
+++ b/datafusion/core/tests/physical_optimizer/partition_statistics.rs
@@ -864,7 +864,7 @@ mod test {
         let plan_string = get_plan_string(&aggregate_exec_partial).swap_remove(0);
         assert_snapshot!(
             plan_string,
-            @"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)], ordering_mode=Sorted"
+            @"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)]"
         );
 
         let p0_statistics = aggregate_exec_partial.partition_statistics(Some(0))?;
diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs
index c8636343ccc5a..facd12d7cbdfe 100644
--- a/datafusion/datasource/src/file_scan_config.rs
+++ b/datafusion/datasource/src/file_scan_config.rs
@@ -665,7 +665,7 @@ impl DataSource for FileScanConfig {
         let schema = self.file_source.table_schema().table_schema();
         let mut eq_properties = EquivalenceProperties::new_with_orderings(
             Arc::clone(schema),
-            self.output_ordering.clone(),
+            self.validated_output_ordering(),
         )
         .with_constraints(self.constraints.clone());
 
@@ -853,6 +853,40 @@ impl DataSource for FileScanConfig {
 }
 
 impl FileScanConfig {
+    /// Returns only the output orderings that are validated against actual
+    /// file group statistics.
+    ///
+    /// For example, individual files may be ordered by `col1 ASC`,
+    /// but if we have files with these min/max statistics in a single partition / file group:
+    ///
+    /// - file1: min(col1) = 10, max(col1) = 20
+    /// - file2: min(col1) = 5, max(col1) = 15
+    ///
+    /// Because reading file1 followed by file2 would produce out-of-order output (there is overlap
+    /// in the ranges), we cannot retain `col1 ASC` as a valid output ordering.
+    ///
+    /// Similarly this would not be a valid order (non-overlapping ranges but not ordered):
+    ///
+    /// - file1: min(col1) = 20, max(col1) = 30
+    /// - file2: min(col1) = 10, max(col1) = 15
+    ///
+    /// On the other hand if we had:
+    ///
+    /// - file1: min(col1) = 5, max(col1) = 15
+    /// - file2: min(col1) = 16, max(col1) = 25
+    ///
+    /// Then we know that reading file1 followed by file2 will produce ordered output,
+    /// so `col1 ASC` would be retained.
+    ///
+    /// Note that we are checking for ordering *within* *each* file group / partition,
+    /// files in different partitions are read independently and do not affect each other's ordering.
+    /// Merging of the multiple partition streams into a single ordered stream is handled
+    /// upstream e.g. by `SortPreservingMergeExec`.
+    fn validated_output_ordering(&self) -> Vec<LexOrdering> {
+        let schema = self.file_source.table_schema().table_schema();
+        validate_orderings(&self.output_ordering, schema, &self.file_groups, None)
+    }
+
     /// Get the file schema (schema of the files without partition columns)
     pub fn file_schema(&self) -> &SchemaRef {
         self.file_source.table_schema().file_schema()
@@ -1202,6 +1236,51 @@ fn ordered_column_indices_from_projection(
         .collect::<Option<Vec<usize>>>()
 }
 
+/// Check whether a given ordering is valid for all file groups by verifying
+/// that files within each group are sorted according to their min/max statistics.
+///
+/// For single-file (or empty) groups, the ordering is trivially valid.
+/// For multi-file groups, we check that the min/max statistics for the sort
+/// columns are in order and non-overlapping (or touching at boundaries).
+///
+/// `projection` maps projected column indices back to table-schema indices
+/// when validating after projection; pass `None` when validating at
+/// table-schema level.
+fn is_ordering_valid_for_file_groups(
+    file_groups: &[FileGroup],
+    ordering: &LexOrdering,
+    schema: &SchemaRef,
+    projection: Option<&[usize]>,
+) -> bool {
+    file_groups.iter().all(|group| {
+        if group.len() <= 1 {
+            return true; // single-file groups are trivially sorted
+        }
+        match MinMaxStatistics::new_from_files(ordering, schema, projection, group.iter())
+        {
+            Ok(stats) => stats.is_sorted(),
+            Err(_) => false, // can't prove sorted → reject
+        }
+    })
+}
+
+/// Filters orderings to retain only those valid for all file groups,
+/// verified via min/max statistics.
+fn validate_orderings(
+    orderings: &[LexOrdering],
+    schema: &SchemaRef,
+    file_groups: &[FileGroup],
+    projection: Option<&[usize]>,
+) -> Vec<LexOrdering> {
+    orderings
+        .iter()
+        .filter(|ordering| {
+            is_ordering_valid_for_file_groups(file_groups, ordering, schema, projection)
+        })
+        .cloned()
+        .collect()
+}
+
 /// The various listing tables does not attempt to read all files
 /// concurrently, instead they will read files in sequence within a
 /// partition.  This is an important property as it allows plans to
@@ -1268,52 +1347,47 @@ fn get_projected_output_ordering(
     let projected_orderings =
         project_orderings(&base_config.output_ordering, projected_schema);
 
-    let mut all_orderings = vec![];
-    for new_ordering in projected_orderings {
-        // Check if any file groups are not sorted
-        if base_config.file_groups.iter().any(|group| {
-            if group.len() <= 1 {
-                // File groups with <= 1 files are always sorted
-                return false;
-            }
-
-            let Some(indices) = base_config
-                .file_source
-                .projection()
-                .as_ref()
-                .map(|p| ordered_column_indices_from_projection(p))
-            else {
-                // Can't determine if ordered without a simple projection
-                return true;
-            };
-
-            let statistics = match MinMaxStatistics::new_from_files(
-                &new_ordering,
+    let indices = base_config
+        .file_source
+        .projection()
+        .as_ref()
+        .map(|p| ordered_column_indices_from_projection(p));
+
+    match indices {
+        Some(Some(indices)) => {
+            // Simple column projection — validate with statistics
+            validate_orderings(
+                &projected_orderings,
                 projected_schema,
-                indices.as_deref(),
-                group.iter(),
-            ) {
-                Ok(statistics) => statistics,
-                Err(e) => {
-                    log::trace!("Error fetching statistics for file group: {e}");
-                    // we can't prove that it's ordered, so we have to reject it
-                    return true;
-                }
-            };
-
-            !statistics.is_sorted()
-        }) {
-            debug!(
-                "Skipping specified output ordering {:?}. \
-                Some file groups couldn't be determined to be sorted: {:?}",
-                base_config.output_ordering[0], base_config.file_groups
-            );
-            continue;
+                &base_config.file_groups,
+                Some(indices.as_slice()),
+            )
+        }
+        None => {
+            // No projection — validate with statistics (no remapping needed)
+            validate_orderings(
+                &projected_orderings,
+                projected_schema,
+                &base_config.file_groups,
+                None,
+            )
+        }
+        Some(None) => {
+            // Complex projection (expressions, not simple columns) — can't
+            // determine column indices for statistics. Still valid if all
+            // file groups have at most one file.
+            if base_config.file_groups.iter().all(|g| g.len() <= 1) {
+                projected_orderings
+            } else {
+                debug!(
+                    "Skipping specified output orderings. \
+                     Some file groups couldn't be determined to be sorted: {:?}",
+                    base_config.file_groups
+                );
+                vec![]
+            }
         }
-
-        all_orderings.push(new_ordering);
     }
-    all_orderings
 }
 
 /// Convert type to a type suitable for use as a `ListingTable`
diff --git a/datafusion/datasource/src/statistics.rs b/datafusion/datasource/src/statistics.rs
index 2f34ca032e132..b1a56e096c222 100644
--- a/datafusion/datasource/src/statistics.rs
+++ b/datafusion/datasource/src/statistics.rs
@@ -266,11 +266,12 @@ impl MinMaxStatistics {
     }
 
     /// Check if the min/max statistics are in order and non-overlapping
+    /// (or touching at boundaries)
     pub fn is_sorted(&self) -> bool {
         self.max_by_sort_order
             .iter()
             .zip(self.min_by_sort_order.iter().skip(1))
-            .all(|(max, next_min)| max < next_min)
+            .all(|(max, next_min)| max <= next_min)
     }
 }
 
diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
index 5a559bdb94835..fd3a40ca17079 100644
--- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
+++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt
@@ -274,4 +274,4 @@ logical_plan
 02)--TableScan: test_table projection=[constant_col]
 physical_plan
 01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST]
-02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col], file_type=parquet
+02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col], output_ordering=[constant_col@0 ASC NULLS LAST], file_type=parquet
diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt
index 4148de1d62f73..a9ccb050b72fe 100644
--- a/datafusion/sqllogictest/test_files/sort_pushdown.slt
+++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt
@@ -1068,3 +1068,542 @@ DROP TABLE agg_expr_parquet;
 
 statement ok
 SET datafusion.optimizer.enable_sort_pushdown = true;
+
+# Test 4: Reversed filesystem order with inferred ordering
+# Create 3 parquet files with non-overlapping id ranges, named so filesystem
+# order is OPPOSITE to data order. Each file is internally sorted by id ASC.
+# Force target_partitions=1 so all files end up in one file group, which is
+# where the inter-file ordering bug manifests.
+# Without inter-file validation, the optimizer would incorrectly trust the
+# inferred ordering and remove SortExec.
+
+# Save current target_partitions and set to 1 to force single file group
+statement ok
+SET datafusion.execution.target_partitions = 1;
+
+statement ok
+CREATE TABLE reversed_high(id INT, value INT) AS VALUES (7, 700), (8, 800), (9, 900);
+
+statement ok
+CREATE TABLE reversed_mid(id INT, value INT) AS VALUES (4, 400), (5, 500), (6, 600);
+
+statement ok
+CREATE TABLE reversed_low(id INT, value INT) AS VALUES (1, 100), (2, 200), (3, 300);
+
+query I
+COPY (SELECT * FROM reversed_high ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/reversed/a_high.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM reversed_mid ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/reversed/b_mid.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM reversed_low ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/reversed/c_low.parquet';
+----
+3
+
+# External table with NO "WITH ORDER" — relies on inferred ordering from parquet metadata
+statement ok
+CREATE EXTERNAL TABLE reversed_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/reversed/';
+
+# Test 4.1: SortExec must be present because files are not in inter-file order
+query TT
+EXPLAIN SELECT * FROM reversed_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: reversed_parquet.id ASC NULLS LAST
+02)--TableScan: reversed_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 4.2: Results must be correct
+query II
+SELECT * FROM reversed_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 5: Overlapping files with inferred ordering
+# Create files with overlapping id ranges
+
+statement ok
+CREATE TABLE overlap_x(id INT, value INT) AS VALUES (1, 100), (3, 300), (5, 500);
+
+statement ok
+CREATE TABLE overlap_y(id INT, value INT) AS VALUES (2, 200), (4, 400), (6, 600);
+
+query I
+COPY (SELECT * FROM overlap_x ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/overlap/file_x.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM overlap_y ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/overlap/file_y.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE overlap_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/overlap/';
+
+# Test 5.1: SortExec must be present because files have overlapping ranges
+query TT
+EXPLAIN SELECT * FROM overlap_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: overlap_parquet.id ASC NULLS LAST
+02)--TableScan: overlap_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/overlap/file_x.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/overlap/file_y.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 5.2: Results must be correct
+query II
+SELECT * FROM overlap_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+
+# Test 6: WITH ORDER + reversed filesystem order
+# Same file setup as Test 4 but explicitly declaring ordering via WITH ORDER.
+# Even with WITH ORDER, the optimizer should detect that inter-file order is wrong
+# and keep SortExec.
+
+statement ok
+CREATE EXTERNAL TABLE reversed_with_order_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/reversed/'
+WITH ORDER (id ASC);
+
+# Test 6.1: SortExec must be present despite WITH ORDER
+query TT
+EXPLAIN SELECT * FROM reversed_with_order_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: reversed_with_order_parquet.id ASC NULLS LAST
+02)--TableScan: reversed_with_order_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 6.2: Results must be correct
+query II
+SELECT * FROM reversed_with_order_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 7: Correctly ordered multi-file single group (positive case)
+# Files are in CORRECT inter-file order within a single group.
+# The validation should PASS and SortExec should be eliminated.
+
+statement ok
+CREATE TABLE correct_low(id INT, value INT) AS VALUES (1, 100), (2, 200), (3, 300);
+
+statement ok
+CREATE TABLE correct_mid(id INT, value INT) AS VALUES (4, 400), (5, 500), (6, 600);
+
+statement ok
+CREATE TABLE correct_high(id INT, value INT) AS VALUES (7, 700), (8, 800), (9, 900);
+
+query I
+COPY (SELECT * FROM correct_low ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/correct/a_low.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM correct_mid ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/correct/b_mid.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM correct_high ORDER BY id ASC)
+TO 'test_files/scratch/sort_pushdown/correct/c_high.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE correct_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/correct/'
+WITH ORDER (id ASC);
+
+# Test 7.1: SortExec should be ELIMINATED — files are in correct inter-file order
+query TT
+EXPLAIN SELECT * FROM correct_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: correct_parquet.id ASC NULLS LAST
+02)--TableScan: correct_parquet projection=[id, value]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/a_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/c_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+
+# Test 7.2: Results must be correct
+query II
+SELECT * FROM correct_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 7.3: DESC query on correctly ordered ASC files should still use SortExec
+# Note: reverse_row_groups=true reverses the file list in the plan display
+query TT
+EXPLAIN SELECT * FROM correct_parquet ORDER BY id DESC;
+----
+logical_plan
+01)Sort: correct_parquet.id DESC NULLS FIRST
+02)--TableScan: correct_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/c_high.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/a_low.parquet]]}, projection=[id, value], file_type=parquet, reverse_row_groups=true
+
+query II
+SELECT * FROM correct_parquet ORDER BY id DESC;
+----
+9 900
+8 800
+7 700
+6 600
+5 500
+4 400
+3 300
+2 200
+1 100
+
+# Test 8: DESC ordering with files in wrong inter-file DESC order
+# Create files internally sorted by id DESC, but named so filesystem order
+# is WRONG for DESC ordering (low values first in filesystem order).
+
+statement ok
+CREATE TABLE desc_low(id INT, value INT) AS VALUES (3, 300), (2, 200), (1, 100);
+
+statement ok
+CREATE TABLE desc_high(id INT, value INT) AS VALUES (9, 900), (8, 800), (7, 700);
+
+query I
+COPY (SELECT * FROM desc_low ORDER BY id DESC)
+TO 'test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM desc_high ORDER BY id DESC)
+TO 'test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE desc_reversed_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/desc_reversed/'
+WITH ORDER (id DESC);
+
+# Test 8.1: SortExec must be present — files are in wrong inter-file DESC order
+# (a_low has 1-3, b_high has 7-9; for DESC, b_high should come first)
+query TT
+EXPLAIN SELECT * FROM desc_reversed_parquet ORDER BY id DESC;
+----
+logical_plan
+01)Sort: desc_reversed_parquet.id DESC NULLS FIRST
+02)--TableScan: desc_reversed_parquet projection=[id, value]
+physical_plan
+01)SortExec: expr=[id@0 DESC], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/a_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/desc_reversed/b_high.parquet]]}, projection=[id, value], file_type=parquet
+
+# Test 8.2: Results must be correct
+query II
+SELECT * FROM desc_reversed_parquet ORDER BY id DESC;
+----
+9 900
+8 800
+7 700
+3 300
+2 200
+1 100
+
+# Test 9: Multi-column sort key validation
+# Files have (category, id) ordering. Files share a boundary value on category='B'
+# so column-level min/max statistics overlap on the primary key column.
+# The validation conservatively rejects this because column-level stats can't
+# precisely represent row-level boundaries for multi-column keys.
+
+statement ok
+CREATE TABLE multi_col_a(category VARCHAR, id INT, value INT) AS VALUES
+('A', 1, 10), ('A', 2, 20), ('B', 1, 30);
+
+statement ok
+CREATE TABLE multi_col_b(category VARCHAR, id INT, value INT) AS VALUES
+('B', 2, 40), ('C', 1, 50), ('C', 2, 60);
+
+query I
+COPY (SELECT * FROM multi_col_a ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col/a_first.parquet';
+----
+3
+
+query I
+COPY (SELECT * FROM multi_col_b ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col/b_second.parquet';
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE multi_col_parquet(category VARCHAR, id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/multi_col/'
+WITH ORDER (category ASC, id ASC);
+
+# Test 9.1: SortExec is present — validation conservatively rejects because
+# column-level stats overlap on category='B' across both files
+query TT
+EXPLAIN SELECT * FROM multi_col_parquet ORDER BY category ASC, id ASC;
+----
+logical_plan
+01)Sort: multi_col_parquet.category ASC NULLS LAST, multi_col_parquet.id ASC NULLS LAST
+02)--TableScan: multi_col_parquet projection=[category, id, value]
+physical_plan
+01)SortExec: expr=[category@0 ASC NULLS LAST, id@1 ASC NULLS LAST], preserve_partitioning=[false]
+02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col/a_first.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col/b_second.parquet]]}, projection=[category, id, value], file_type=parquet
+
+# Test 9.2: Results must be correct
+query TII
+SELECT * FROM multi_col_parquet ORDER BY category ASC, id ASC;
+----
+A 1 10
+A 2 20
+B 1 30
+B 2 40
+C 1 50
+C 2 60
+
+# Test 9.3: Multi-column sort with non-overlapping primary key across files
+# When files don't overlap on the primary column, validation succeeds.
+
+statement ok
+CREATE TABLE multi_col_x(category VARCHAR, id INT, value INT) AS VALUES
+('A', 1, 10), ('A', 2, 20);
+
+statement ok
+CREATE TABLE multi_col_y(category VARCHAR, id INT, value INT) AS VALUES
+('B', 1, 30), ('B', 2, 40);
+
+query I
+COPY (SELECT * FROM multi_col_x ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col_clean/x_first.parquet';
+----
+2
+
+query I
+COPY (SELECT * FROM multi_col_y ORDER BY category ASC, id ASC)
+TO 'test_files/scratch/sort_pushdown/multi_col_clean/y_second.parquet';
+----
+2
+
+statement ok
+CREATE EXTERNAL TABLE multi_col_clean_parquet(category VARCHAR, id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/multi_col_clean/'
+WITH ORDER (category ASC, id ASC);
+
+# Test 9.3a: SortExec should be eliminated — non-overlapping primary column
+query TT
+EXPLAIN SELECT * FROM multi_col_clean_parquet ORDER BY category ASC, id ASC;
+----
+logical_plan
+01)Sort: multi_col_clean_parquet.category ASC NULLS LAST, multi_col_clean_parquet.id ASC NULLS LAST
+02)--TableScan: multi_col_clean_parquet projection=[category, id, value]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col_clean/x_first.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/multi_col_clean/y_second.parquet]]}, projection=[category, id, value], output_ordering=[category@0 ASC NULLS LAST, id@1 ASC NULLS LAST], file_type=parquet
+
+# Test 9.3b: Results must be correct
+query TII
+SELECT * FROM multi_col_clean_parquet ORDER BY category ASC, id ASC;
+----
+A 1 10
+A 2 20
+B 1 30
+B 2 40
+
+# Test 10: Correctly ordered files WITH ORDER (positive counterpart to Test 6)
+# Files in correct_parquet are in correct ASC order — WITH ORDER should pass validation
+# and SortExec should be eliminated.
+
+statement ok
+CREATE EXTERNAL TABLE correct_with_order_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/correct/'
+WITH ORDER (id ASC);
+
+# Test 10.1: SortExec should be ELIMINATED — files are in correct order
+query TT
+EXPLAIN SELECT * FROM correct_with_order_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: correct_with_order_parquet.id ASC NULLS LAST
+02)--TableScan: correct_with_order_parquet projection=[id, value]
+physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/a_low.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/correct/c_high.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+
+# Test 10.2: Results must be correct
+query II
+SELECT * FROM correct_with_order_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Test 11: Multiple file groups (target_partitions > 1) — each group has one file
+# When files are spread across separate partitions (one file per group), each
+# partition is trivially sorted and SortPreservingMergeExec handles the merge.
+
+# Restore higher target_partitions so files go into separate groups
+statement ok
+SET datafusion.execution.target_partitions = 4;
+
+statement ok
+CREATE EXTERNAL TABLE multi_partition_parquet(id INT, value INT)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/sort_pushdown/reversed/'
+WITH ORDER (id ASC);
+
+# Test 11.1: With separate partitions, each file is trivially sorted.
+# SortPreservingMergeExec merges, no SortExec needed per-partition.
+query TT
+EXPLAIN SELECT * FROM multi_partition_parquet ORDER BY id ASC;
+----
+logical_plan
+01)Sort: multi_partition_parquet.id ASC NULLS LAST
+02)--TableScan: multi_partition_parquet projection=[id, value]
+physical_plan
+01)SortPreservingMergeExec: [id@0 ASC NULLS LAST]
+02)--DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/b_mid.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/reversed/c_low.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet
+
+# Test 11.2: Results must be correct
+query II
+SELECT * FROM multi_partition_parquet ORDER BY id ASC;
+----
+1 100
+2 200
+3 300
+4 400
+5 500
+6 600
+7 700
+8 800
+9 900
+
+# Restore target_partitions to 1 for remaining cleanup
+statement ok
+SET datafusion.execution.target_partitions = 2;
+
+statement ok
+DROP TABLE reversed_high;
+
+statement ok
+DROP TABLE reversed_mid;
+
+statement ok
+DROP TABLE reversed_low;
+
+statement ok
+DROP TABLE reversed_parquet;
+
+statement ok
+DROP TABLE overlap_x;
+
+statement ok
+DROP TABLE overlap_y;
+
+statement ok
+DROP TABLE overlap_parquet;
+
+statement ok
+DROP TABLE reversed_with_order_parquet;
+
+statement ok
+DROP TABLE correct_low;
+
+statement ok
+DROP TABLE correct_mid;
+
+statement ok
+DROP TABLE correct_high;
+
+statement ok
+DROP TABLE correct_parquet;
+
+statement ok
+DROP TABLE desc_low;
+
+statement ok
+DROP TABLE desc_high;
+
+statement ok
+DROP TABLE desc_reversed_parquet;
+
+statement ok
+DROP TABLE multi_col_a;
+
+statement ok
+DROP TABLE multi_col_b;
+
+statement ok
+DROP TABLE multi_col_parquet;
+
+statement ok
+DROP TABLE multi_col_x;
+
+statement ok
+DROP TABLE multi_col_y;
+
+statement ok
+DROP TABLE multi_col_clean_parquet;
+
+statement ok
+DROP TABLE correct_with_order_parquet;
+
+statement ok
+DROP TABLE multi_partition_parquet;
+

From 866a4f01e70418a86375d7439d1285c5c7760313 Mon Sep 17 00:00:00 2001
From: Tim-53 <82676248+Tim-53@users.noreply.github.com>
Date: Tue, 24 Feb 2026 13:10:15 +0100
Subject: [PATCH 09/11] fix: HashJoin panic with dictionary-encoded columns in
 multi-key joins (#20441)

- Closes #20437

`flatten_dictionary_array` returned only the unique values rather then
the full expanded array when being called on a `DictionaryArray`. When
building a `StructArray` this caused a length mismatch panic.

Replaced `array.values()` with `arrow::compute::cast(array, value_type)`
in `flatten_dictionary_array`, which properly expands the dictionary
into a full length array matching the row count.

Yes, both a new unit test aswell as a regression test were added.

Nope

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
---
 .../src/joins/hash_join/inlist_builder.rs     |  58 ++++++--
 datafusion/sqllogictest/test_files/joins.slt  | 125 ++++++++++++++++++
 2 files changed, 174 insertions(+), 9 deletions(-)

diff --git a/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
index 7dccc5b0ba7c2..9bf59d9e333d6 100644
--- a/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
@@ -20,8 +20,8 @@
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, StructArray};
+use arrow::compute::cast;
 use arrow::datatypes::{Field, FieldRef, Fields};
-use arrow::downcast_dictionary_array;
 use arrow_schema::DataType;
 use datafusion_common::Result;
 
@@ -33,15 +33,16 @@ pub(super) fn build_struct_fields(data_types: &[DataType]) -> Result<Fields> {
         .collect()
 }
 
-/// Flattens dictionary-encoded arrays to their underlying value arrays.
+/// Casts dictionary-encoded arrays to their underlying value type, preserving row count.
 /// Non-dictionary arrays are returned as-is.
-fn flatten_dictionary_array(array: &ArrayRef) -> ArrayRef {
-    downcast_dictionary_array! {
-        array => {
+fn flatten_dictionary_array(array: &ArrayRef) -> Result<ArrayRef> {
+    match array.data_type() {
+        DataType::Dictionary(_, value_type) => {
+            let casted = cast(array, value_type)?;
             // Recursively flatten in case of nested dictionaries
-            flatten_dictionary_array(array.values())
+            flatten_dictionary_array(&casted)
         }
-        _ => Arc::clone(array)
+        _ => Ok(Arc::clone(array)),
     }
 }
 
@@ -68,7 +69,7 @@ pub(super) fn build_struct_inlist_values(
     let flattened_arrays: Vec<ArrayRef> = join_key_arrays
         .iter()
         .map(flatten_dictionary_array)
-        .collect();
+        .collect::<Result<Vec<_>>>()?;
 
     // Build the source array/struct
     let source_array: ArrayRef = if flattened_arrays.len() == 1 {
@@ -99,7 +100,9 @@ pub(super) fn build_struct_inlist_values(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use arrow::array::{Int32Array, StringArray};
+    use arrow::array::{
+        DictionaryArray, Int8Array, Int32Array, StringArray, StringDictionaryBuilder,
+    };
     use arrow_schema::DataType;
     use std::sync::Arc;
 
@@ -130,4 +133,41 @@ mod tests {
             )
         );
     }
+
+    #[test]
+    fn test_build_multi_column_inlist_with_dictionary() {
+        let mut builder = StringDictionaryBuilder::<arrow::datatypes::Int8Type>::new();
+        builder.append_value("foo");
+        builder.append_value("foo");
+        builder.append_value("foo");
+        let dict_array = Arc::new(builder.finish()) as ArrayRef;
+
+        let int_array = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef;
+
+        let result = build_struct_inlist_values(&[dict_array, int_array])
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(
+            *result.data_type(),
+            DataType::Struct(
+                build_struct_fields(&[DataType::Utf8, DataType::Int32]).unwrap()
+            )
+        );
+    }
+
+    #[test]
+    fn test_build_single_column_dictionary_inlist() {
+        let keys = Int8Array::from(vec![0i8, 0, 0]);
+        let values = Arc::new(StringArray::from(vec!["foo"]));
+        let dict_array = Arc::new(DictionaryArray::new(keys, values)) as ArrayRef;
+
+        let result = build_struct_inlist_values(std::slice::from_ref(&dict_array))
+            .unwrap()
+            .unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert_eq!(*result.data_type(), DataType::Utf8);
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt
index 38037ede21db2..e020885a447e0 100644
--- a/datafusion/sqllogictest/test_files/joins.slt
+++ b/datafusion/sqllogictest/test_files/joins.slt
@@ -5198,3 +5198,128 @@ DROP TABLE t1_c;
 
 statement ok
 DROP TABLE t2_c;
+
+# Reproducer of https://github.com/apache/datafusion/issues/19067
+statement count 0
+set datafusion.explain.physical_plan_only = true;
+
+# Setup Left Table with FixedSizeBinary(4)
+statement count 0
+CREATE TABLE issue_19067_left AS
+SELECT
+  column1 as id,
+  arrow_cast(decode(column2, 'hex'), 'FixedSizeBinary(4)') as join_key
+FROM (VALUES
+  (1, 'AAAAAAAA'),
+  (2, 'BBBBBBBB'),
+  (3, 'CCCCCCCC')
+);
+
+# Setup Right Table with FixedSizeBinary(4)
+statement count 0
+CREATE TABLE issue_19067_right AS
+SELECT
+  arrow_cast(decode(column1, 'hex'), 'FixedSizeBinary(4)') as join_key,
+  column2 as value
+FROM (VALUES
+  ('AAAAAAAA', 1000),
+  ('BBBBBBBB', 2000)
+);
+
+# Perform Left Join. Third row should contain NULL in `right_key`.
+query I??I
+SELECT
+  l.id,
+  l.join_key as left_key,
+  r.join_key as right_key,
+  r.value
+FROM issue_19067_left l
+LEFT JOIN issue_19067_right r ON l.join_key = r.join_key
+ORDER BY l.id;
+----
+1 aaaaaaaa aaaaaaaa 1000
+2 bbbbbbbb bbbbbbbb 2000
+3 cccccccc NULL NULL
+
+# Ensure usage of HashJoinExec
+query TT
+EXPLAIN
+SELECT
+  l.id,
+  l.join_key as left_key,
+  r.join_key as right_key,
+  r.value
+FROM issue_19067_left l
+LEFT JOIN issue_19067_right r ON l.join_key = r.join_key
+ORDER BY l.id;
+----
+physical_plan
+01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false]
+02)--ProjectionExec: expr=[id@2 as id, join_key@3 as left_key, join_key@0 as right_key, value@1 as value]
+03)----HashJoinExec: mode=CollectLeft, join_type=Right, on=[(join_key@0, join_key@1)]
+04)------DataSourceExec: partitions=1, partition_sizes=[1]
+05)------DataSourceExec: partitions=1, partition_sizes=[1]
+
+statement count 0
+set datafusion.explain.physical_plan_only = false;
+
+statement count 0
+DROP TABLE issue_19067_left;
+
+statement count 0
+DROP TABLE issue_19067_right;
+
+# Test that empty projections pushed into joins produce correct row counts at runtime.
+# When count(1) is used over a RIGHT/FULL JOIN, the optimizer embeds an empty projection
+# (projection=[]) into the HashJoinExec. This validates that the runtime batch construction
+# handles zero-column output correctly, preserving the correct number of rows.
+
+statement ok
+CREATE TABLE empty_proj_left AS VALUES (1, 'a'), (2, 'b'), (3, 'c');
+
+statement ok
+CREATE TABLE empty_proj_right AS VALUES (1, 'x'), (2, 'y'), (4, 'z');
+
+query I
+SELECT count(1) FROM empty_proj_left RIGHT JOIN empty_proj_right ON empty_proj_left.column1 = empty_proj_right.column1;
+----
+3
+
+query I
+SELECT count(1) FROM empty_proj_left FULL JOIN empty_proj_right ON empty_proj_left.column1 = empty_proj_right.column1;
+----
+4
+
+statement count 0
+DROP TABLE empty_proj_left;
+
+statement count 0
+DROP TABLE empty_proj_right;
+
+# Issue #20437: HashJoin panic with dictionary-encoded columns in multi-key joins
+# https://github.com/apache/datafusion/issues/20437
+
+statement ok
+CREATE TABLE issue_20437_small AS
+SELECT id, arrow_cast(region, 'Dictionary(Int32, Utf8)') AS region
+FROM (VALUES (1, 'west'), (2, 'west')) AS t(id, region);
+
+statement ok
+CREATE TABLE issue_20437_large AS
+SELECT id, region, value
+FROM (VALUES (1, 'west', 100), (2, 'west', 200), (3, 'east', 300)) AS t(id, region, value);
+
+query ITI
+SELECT s.id, s.region, l.value
+FROM issue_20437_small s
+JOIN issue_20437_large l ON s.id = l.id AND s.region = l.region
+ORDER BY s.id;
+----
+1 west 100
+2 west 200
+
+statement count 0
+DROP TABLE issue_20437_small;
+
+statement count 0
+DROP TABLE issue_20437_large;

From 20598115a943fb0da68f9fe8507b510d8f292a9c Mon Sep 17 00:00:00 2001
From: Andrew Lamb <andrew@nerdnetworks.org>
Date: Mon, 23 Feb 2026 14:38:11 -0500
Subject: [PATCH 10/11] Avoid flattening dictionaries in Join InLists

---
 .../src/joins/hash_join/inlist_builder.rs     | 39 +++++++------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
index 9bf59d9e333d6..0ca338265ecc6 100644
--- a/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
+++ b/datafusion/physical-plan/src/joins/hash_join/inlist_builder.rs
@@ -20,7 +20,6 @@
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, StructArray};
-use arrow::compute::cast;
 use arrow::datatypes::{Field, FieldRef, Fields};
 use arrow_schema::DataType;
 use datafusion_common::Result;
@@ -33,19 +32,6 @@ pub(super) fn build_struct_fields(data_types: &[DataType]) -> Result<Fields> {
         .collect()
 }
 
-/// Casts dictionary-encoded arrays to their underlying value type, preserving row count.
-/// Non-dictionary arrays are returned as-is.
-fn flatten_dictionary_array(array: &ArrayRef) -> Result<ArrayRef> {
-    match array.data_type() {
-        DataType::Dictionary(_, value_type) => {
-            let casted = cast(array, value_type)?;
-            // Recursively flatten in case of nested dictionaries
-            flatten_dictionary_array(&casted)
-        }
-        _ => Ok(Arc::clone(array)),
-    }
-}
-
 /// Builds InList values from join key column arrays.
 ///
 /// If `join_key_arrays` is:
@@ -65,20 +51,14 @@ fn flatten_dictionary_array(array: &ArrayRef) -> Result<ArrayRef> {
 pub(super) fn build_struct_inlist_values(
     join_key_arrays: &[ArrayRef],
 ) -> Result<Option<ArrayRef>> {
-    // Flatten any dictionary-encoded arrays
-    let flattened_arrays: Vec<ArrayRef> = join_key_arrays
-        .iter()
-        .map(flatten_dictionary_array)
-        .collect::<Result<Vec<_>>>()?;
-
     // Build the source array/struct
-    let source_array: ArrayRef = if flattened_arrays.len() == 1 {
+    let source_array: ArrayRef = if join_key_arrays.len() == 1 {
         // Single column: use directly
-        Arc::clone(&flattened_arrays[0])
+        Arc::clone(&join_key_arrays[0])
     } else {
         // Multi-column: build StructArray once from all columns
         let fields = build_struct_fields(
-            &flattened_arrays
+            &join_key_arrays
                 .iter()
                 .map(|arr| arr.data_type().clone())
                 .collect::<Vec<_>>(),
@@ -88,7 +68,7 @@ pub(super) fn build_struct_inlist_values(
         let arrays_with_fields: Vec<(FieldRef, ArrayRef)> = fields
             .iter()
             .cloned()
-            .zip(flattened_arrays.iter().cloned())
+            .zip(join_key_arrays.iter().cloned())
             .collect();
 
         Arc::new(StructArray::from(arrays_with_fields))
@@ -152,7 +132,14 @@ mod tests {
         assert_eq!(
             *result.data_type(),
             DataType::Struct(
-                build_struct_fields(&[DataType::Utf8, DataType::Int32]).unwrap()
+                build_struct_fields(&[
+                    DataType::Dictionary(
+                        Box::new(DataType::Int8),
+                        Box::new(DataType::Utf8)
+                    ),
+                    DataType::Int32
+                ])
+                .unwrap()
             )
         );
     }
@@ -168,6 +155,6 @@ mod tests {
             .unwrap();
 
         assert_eq!(result.len(), 3);
-        assert_eq!(*result.data_type(), DataType::Utf8);
+        assert_eq!(result.data_type(), dict_array.data_type());
     }
 }

From 6a0206e08361c7b325f03f37da71e760b1cfaf18 Mon Sep 17 00:00:00 2001
From: Adam Curtis <adam.curtis.dev@gmail.com>
Date: Wed, 4 Mar 2026 10:47:39 -0500
Subject: [PATCH 11/11] chore: bump arrow/parquet to 57.3.0

Includes fix for FixedSizeBinary LEFT JOIN bug
- https://github.com/apache/arrow-rs/pull/8981

Cherry-picked test and API updates from
- https://github.com/apache/datafusion/pull/19355
---
 Cargo.lock                                    | 64 +++++++++----------
 Cargo.toml                                    | 14 ++--
 datafusion/common/src/scalar/mod.rs           |  7 +-
 .../src/avro_to_arrow/schema.rs               |  4 +-
 .../functions/src/core/union_extract.rs       |  5 +-
 datafusion/physical-plan/src/filter.rs        |  5 +-
 datafusion/proto-common/src/from_proto/mod.rs | 21 +++---
 .../tests/cases/roundtrip_logical_plan.rs     | 12 ++--
 datafusion/sqllogictest/src/test_context.rs   |  5 +-
 datafusion/sqllogictest/test_files/case.slt   |  5 +-
 .../test_files/spark/hash/crc32.slt           |  6 +-
 datafusion/sqllogictest/test_files/struct.slt | 39 ++++++++---
 12 files changed, 109 insertions(+), 78 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c2e9312a24723..3bdae442123fa 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -223,9 +223,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
 [[package]]
 name = "arrow"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb372a7cbcac02a35d3fb7b3fc1f969ec078e871f9bb899bf00a2e1809bec8a3"
+checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -246,9 +246,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-arith"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0f377dcd19e440174596d83deb49cd724886d91060c07fec4f67014ef9d54049"
+checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -260,9 +260,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-array"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a23eaff85a44e9fa914660fb0d0bb00b79c4a3d888b5334adb3ea4330c84f002"
+checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef"
 dependencies = [
  "ahash",
  "arrow-buffer",
@@ -279,9 +279,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-buffer"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2819d893750cb3380ab31ebdc8c68874dd4429f90fd09180f3c93538bd21626"
+checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2"
 dependencies = [
  "bytes",
  "half",
@@ -291,9 +291,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-cast"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e3d131abb183f80c450d4591dc784f8d7750c50c6e2bc3fcaad148afc8361271"
+checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -313,9 +313,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-csv"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2275877a0e5e7e7c76954669366c2aa1a829e340ab1f612e647507860906fb6b"
+checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a"
 dependencies = [
  "arrow-array",
  "arrow-cast",
@@ -328,9 +328,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-data"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "05738f3d42cb922b9096f7786f606fcb8669260c2640df8490533bb2fa38c9d3"
+checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304"
 dependencies = [
  "arrow-buffer",
  "arrow-schema",
@@ -341,9 +341,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-flight"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b5f57c3d39d1b1b7c1376a772ea86a131e7da310aed54ebea9363124bb885e3"
+checksum = "58c5b083668e6230eae3eab2fc4b5fb989974c845d0aa538dde61a4327c78675"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -369,9 +369,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-ipc"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d09446e8076c4b3f235603d9ea7c5494e73d441b01cd61fb33d7254c11964b3"
+checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -385,9 +385,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-json"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "371ffd66fa77f71d7628c63f209c9ca5341081051aa32f9c8020feb0def787c0"
+checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -409,9 +409,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-ord"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cbc94fc7adec5d1ba9e8cd1b1e8d6f72423b33fe978bf1f46d970fafab787521"
+checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -422,9 +422,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-row"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "169676f317157dc079cc5def6354d16db63d8861d61046d2f3883268ced6f99f"
+checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -435,9 +435,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-schema"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d27609cd7dd45f006abae27995c2729ef6f4b9361cde1ddd019dc31a5aa017e0"
+checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68"
 dependencies = [
  "bitflags",
  "serde",
@@ -447,9 +447,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-select"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ae980d021879ea119dd6e2a13912d81e64abed372d53163e804dfe84639d8010"
+checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b"
 dependencies = [
  "ahash",
  "arrow-array",
@@ -461,9 +461,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-string"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf35e8ef49dcf0c5f6d175edee6b8af7b45611805333129c541a8b89a0fc0534"
+checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -4380,9 +4380,9 @@ dependencies = [
 
 [[package]]
 name = "parquet"
-version = "57.1.0"
+version = "57.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be3e4f6d320dd92bfa7d612e265d7d08bba0a240bab86af3425e1d255a511d89"
+checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb"
 dependencies = [
  "ahash",
  "arrow-array",
diff --git a/Cargo.toml b/Cargo.toml
index 6424f512cc3df..8bc8006daf147 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -91,19 +91,19 @@ ahash = { version = "0.8", default-features = false, features = [
     "runtime-rng",
 ] }
 apache-avro = { version = "0.21", default-features = false }
-arrow = { version = "57.1.0", features = [
+arrow = { version = "57.2.0", features = [
     "prettyprint",
     "chrono-tz",
 ] }
-arrow-buffer = { version = "57.1.0", default-features = false }
-arrow-flight = { version = "57.1.0", features = [
+arrow-buffer = { version = "57.2.0", default-features = false }
+arrow-flight = { version = "57.2.0", features = [
     "flight-sql-experimental",
 ] }
-arrow-ipc = { version = "57.1.0", default-features = false, features = [
+arrow-ipc = { version = "57.2.0", default-features = false, features = [
     "lz4",
 ] }
-arrow-ord = { version = "57.1.0", default-features = false }
-arrow-schema = { version = "57.1.0", default-features = false }
+arrow-ord = { version = "57.2.0", default-features = false }
+arrow-schema = { version = "57.2.0", default-features = false }
 async-trait = "0.1.89"
 bigdecimal = "0.4.8"
 bytes = "1.11"
@@ -166,7 +166,7 @@ log = "^0.4"
 num-traits = { version = "0.2" }
 object_store = { version = "0.12.4", default-features = false }
 parking_lot = "0.12"
-parquet = { version = "57.1.0", default-features = false, features = [
+parquet = { version = "57.2.0", default-features = false, features = [
     "arrow",
     "async",
     "object_store",
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs
index e4e048ad3c0d8..eda4952cf590b 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -8868,7 +8868,7 @@ mod tests {
             .unwrap(),
             ScalarValue::try_new_null(&DataType::Map(map_field_ref, false)).unwrap(),
             ScalarValue::try_new_null(&DataType::Union(
-                UnionFields::new(vec![42], vec![field_ref]),
+                UnionFields::try_new(vec![42], vec![field_ref]).unwrap(),
                 UnionMode::Dense,
             ))
             .unwrap(),
@@ -8971,13 +8971,14 @@ mod tests {
         }
 
         // Test union type
-        let union_fields = UnionFields::new(
+        let union_fields = UnionFields::try_new(
             vec![0, 1],
             vec![
                 Field::new("i32", DataType::Int32, false),
                 Field::new("f64", DataType::Float64, false),
             ],
-        );
+        )
+        .unwrap();
         let union_result = ScalarValue::new_default(&DataType::Union(
             union_fields.clone(),
             UnionMode::Sparse,
diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
index 0e8f2a4d56088..053be3c9aff94 100644
--- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
+++ b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs
@@ -117,8 +117,8 @@ fn schema_to_field_with_props(
                     .iter()
                     .map(|s| schema_to_field_with_props(s, None, has_nullable, None))
                     .collect::<Result<Vec<Field>>>()?;
-                let type_ids = 0_i8..fields.len() as i8;
-                DataType::Union(UnionFields::new(type_ids, fields), UnionMode::Dense)
+                // Assign type_ids based on the order in which they appear
+                DataType::Union(UnionFields::from_fields(fields), UnionMode::Dense)
             }
         }
         AvroSchema::Record(RecordSchema { fields, .. }) => {
diff --git a/datafusion/functions/src/core/union_extract.rs b/datafusion/functions/src/core/union_extract.rs
index 56d4f23cc4e2e..8d915fb2e2c07 100644
--- a/datafusion/functions/src/core/union_extract.rs
+++ b/datafusion/functions/src/core/union_extract.rs
@@ -189,13 +189,14 @@ mod tests {
     fn test_scalar_value() -> Result<()> {
         let fun = UnionExtractFun::new();
 
-        let fields = UnionFields::new(
+        let fields = UnionFields::try_new(
             vec![1, 3],
             vec![
                 Field::new("str", DataType::Utf8, false),
                 Field::new("int", DataType::Int32, false),
             ],
-        );
+        )
+        .unwrap();
 
         let args = vec![
             ColumnarValue::Scalar(ScalarValue::Union(
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index e724cdad64840..e78e50c9868fb 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -1540,13 +1540,14 @@ mod tests {
     #[test]
     fn test_equivalence_properties_union_type() -> Result<()> {
         let union_type = DataType::Union(
-            UnionFields::new(
+            UnionFields::try_new(
                 vec![0, 1],
                 vec![
                     Field::new("f1", DataType::Int32, true),
                     Field::new("f2", DataType::Utf8, true),
                 ],
-            ),
+            )
+            .unwrap(),
             UnionMode::Sparse,
         );
 
diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs
index e8e71c3884586..3c41b8cad9ed1 100644
--- a/datafusion/proto-common/src/from_proto/mod.rs
+++ b/datafusion/proto-common/src/from_proto/mod.rs
@@ -304,13 +304,16 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType {
                 };
                 let union_fields = parse_proto_fields_to_fields(&union.union_types)?;
 
-                // Default to index based type ids if not provided
-                let type_ids: Vec<_> = match union.type_ids.is_empty() {
-                    true => (0..union_fields.len() as i8).collect(),
-                    false => union.type_ids.iter().map(|i| *i as i8).collect(),
+                // Default to index based type ids if not explicitly provided
+                let union_fields = if union.type_ids.is_empty() {
+                    UnionFields::from_fields(union_fields)
+                } else {
+                    let type_ids = union.type_ids.iter().map(|i| *i as i8);
+                    UnionFields::try_new(type_ids, union_fields).map_err(|e| {
+                        DataFusionError::from(e).context("Deserializing Union DataType")
+                    })?
                 };
-
-                DataType::Union(UnionFields::new(type_ids, union_fields), union_mode)
+                DataType::Union(union_fields, union_mode)
             }
             arrow_type::ArrowTypeEnum::Dictionary(dict) => {
                 let key_datatype = dict.as_ref().key.as_deref().required("key")?;
@@ -602,7 +605,9 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
                     .collect::<Option<Vec<_>>>();
                 let fields = fields.ok_or_else(|| Error::required("UnionField"))?;
                 let fields = parse_proto_fields_to_fields(&fields)?;
-                let fields = UnionFields::new(ids, fields);
+                let union_fields = UnionFields::try_new(ids, fields).map_err(|e| {
+                    DataFusionError::from(e).context("Deserializing Union ScalarValue")
+                })?;
                 let v_id = val.value_id as i8;
                 let val = match &val.value {
                     None => None,
@@ -614,7 +619,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue {
                         Some((v_id, Box::new(val)))
                     }
                 };
-                Self::Union(val, fields, mode)
+                Self::Union(val, union_fields, mode)
             }
             Value::FixedSizeBinaryValue(v) => {
                 Self::FixedSizeBinary(v.length, Some(v.clone().values))
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index bcfda648b53e5..b9af9fc9352b2 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -1780,19 +1780,20 @@ fn round_trip_datatype() {
             ),
         ])),
         DataType::Union(
-            UnionFields::new(
+            UnionFields::try_new(
                 vec![7, 5, 3],
                 vec![
                     Field::new("nullable", DataType::Boolean, false),
                     Field::new("name", DataType::Utf8, false),
                     Field::new("datatype", DataType::Binary, false),
                 ],
-            ),
+            )
+            .unwrap(),
             UnionMode::Sparse,
         ),
         DataType::Union(
-            UnionFields::new(
-                vec![5, 8, 1],
+            UnionFields::try_new(
+                vec![5, 8, 1, 100],
                 vec![
                     Field::new("nullable", DataType::Boolean, false),
                     Field::new("name", DataType::Utf8, false),
@@ -1807,7 +1808,8 @@ fn round_trip_datatype() {
                         true,
                     ),
                 ],
-            ),
+            )
+            .unwrap(),
             UnionMode::Dense,
         ),
         DataType::Dictionary(
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
index 9ec085b41eec0..a69bdac663002 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -436,14 +436,15 @@ fn create_example_udf() -> ScalarUDF {
 
 fn register_union_table(ctx: &SessionContext) {
     let union = UnionArray::try_new(
-        UnionFields::new(
+        UnionFields::try_new(
             // typeids: 3 for int, 1 for string
             vec![3, 1],
             vec![
                 Field::new("int", DataType::Int32, false),
                 Field::new("string", DataType::Utf8, false),
             ],
-        ),
+        )
+        .unwrap(),
         ScalarBuffer::from(vec![3, 1, 3]),
         None,
         vec![
diff --git a/datafusion/sqllogictest/test_files/case.slt b/datafusion/sqllogictest/test_files/case.slt
index 074d216ac7524..481dde5be9f5c 100644
--- a/datafusion/sqllogictest/test_files/case.slt
+++ b/datafusion/sqllogictest/test_files/case.slt
@@ -384,8 +384,7 @@ SELECT column2, column3, column4  FROM t;
 {foo: a, xxx: b} {xxx: c, foo: d} {xxx: e}
 
 # coerce structs with different field orders,
-# (note the *value*s are from column2 but the field name is 'xxx', as the coerced
-# type takes the field name from the last argument (column3)
+# should keep the same field values
 query ?
 SELECT
   case
@@ -394,7 +393,7 @@ SELECT
   end
 FROM t;
 ----
-{xxx: a, foo: b}
+{xxx: b, foo: a}
 
 # coerce structs with different field orders
 query ?
diff --git a/datafusion/sqllogictest/test_files/spark/hash/crc32.slt b/datafusion/sqllogictest/test_files/spark/hash/crc32.slt
index 6fbeb11fb9a36..df5588c75837d 100644
--- a/datafusion/sqllogictest/test_files/spark/hash/crc32.slt
+++ b/datafusion/sqllogictest/test_files/spark/hash/crc32.slt
@@ -81,7 +81,7 @@ SELECT crc32(arrow_cast('Spark', 'BinaryView'));
 ----
 1557323817
 
-# Upstream arrow-rs issue: https://github.com/apache/arrow-rs/issues/8841
-# This should succeed after we receive the fix
-query error Arrow error: Compute error: Internal Error: Cannot cast BinaryView to BinaryArray of expected type
+query I
 select crc32(arrow_cast(null, 'Dictionary(Int32, Utf8)'))
+----
+NULL
diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt
index d985af1104da3..a91a5e7f870a9 100644
--- a/datafusion/sqllogictest/test_files/struct.slt
+++ b/datafusion/sqllogictest/test_files/struct.slt
@@ -492,9 +492,18 @@ Struct("r": Utf8, "c": Float64)
 statement ok
 drop table t;
 
-query error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'a' to value of Float64 type
+statement ok
 create table t as values({r: 'a', c: 1}), ({c: 2.3, r: 'b'});
 
+query ?
+select * from t;
+----
+{c: 1.0, r: a}
+{c: 2.3, r: b}
+
+statement ok
+drop table t;
+
 ##################################
 ## Test Coalesce with Struct
 ##################################
@@ -560,10 +569,18 @@ create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as valu
     (row('purple', 1), row('green', 2.3));
 
 # out of order struct literal
-# TODO: This query should not fail
-statement error DataFusion error: Optimizer rule 'simplify_expressions' failed[\s\S]*Arrow error: Cast error: Cannot cast string 'b' to value of Int32 type
+statement ok
 create table t(a struct(r varchar, c int)) as values ({r: 'a', c: 1}), ({c: 2, r: 'b'});
 
+query ?
+select * from t;
+----
+{r: a, c: 1}
+{r: b, c: 2}
+
+statement ok
+drop table t;
+
 ##################################
 ## Test Array of Struct
 ##################################
@@ -573,9 +590,11 @@ select [{r: 'a', c: 1}, {r: 'b', c: 2}];
 ----
 [{r: a, c: 1}, {r: b, c: 2}]
 
-# Can't create a list of struct with different field types
-query error
+# Create a list of struct with different field types
+query ?
 select [{r: 'a', c: 1}, {c: 2, r: 'b'}];
+----
+[{c: 1, r: a}, {c: 2, r: b}]
 
 statement ok
 create table t(a struct(r varchar, c int), b struct(r varchar, c float)) as values (row('a', 1), row('b', 2.3));
@@ -592,9 +611,11 @@ drop table t;
 statement ok
 create table t(a struct(r varchar, c int), b struct(c float, r varchar)) as values (row('a', 1), row(2.3, 'b'));
 
-# create array with different struct type is not valid
-query error
+# create array with different struct type should be cast
+query T
 select arrow_typeof([a, b]) from t;
+----
+List(Struct("c": Float32, "r": Utf8View))
 
 statement ok
 drop table t;
@@ -602,13 +623,13 @@ drop table t;
 statement ok
 create table t(a struct(r varchar, c int, g float), b struct(r varchar, c float, g int)) as values (row('a', 1, 2.3), row('b', 2.3, 2));
 
-# type of each column should not coerced but perserve as it is
+# type of each column should not coerced but preserve as it is
 query T
 select arrow_typeof(a) from t;
 ----
 Struct("r": Utf8View, "c": Int32, "g": Float32)
 
-# type of each column should not coerced but perserve as it is
+# type of each column should not coerced but preserve as it is
 query T
 select arrow_typeof(b) from t;
 ----