From a32c4b26c2e4f6ad856f64cb81bb0653fa4f9592 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 7 Apr 2026 07:43:27 -0500 Subject: [PATCH 1/6] fix: preserve source field metadata in TryCast expressions (#21390) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? N/A — discovered while investigating metadata propagation through cast expressions in #21322 ## Rationale for this change `Expr::Cast` preserves the source field's metadata through a dedicated `to_field` handler in `expr_schema.rs`, but `Expr::TryCast` fell through to the default case which creates a `Field::new(...)` without any metadata. This caused source column metadata to be silently dropped when using `TRY_CAST`. ## What changes are included in this PR? - Added a dedicated `to_field` handler for `Expr::TryCast` in `expr_schema.rs` that preserves source field metadata (matching `Expr::Cast` behavior), while keeping TryCast's always-nullable semantics. - Added SLT tests in `metadata.slt` verifying metadata preservation through `TRY_CAST` on both timestamp and integer columns. ## Are these changes tested? Yes — new sqllogictest cases in `metadata.slt` using `arrow_metadata()` to verify metadata is preserved through `TRY_CAST`. ## Are there any user-facing changes? `TRY_CAST` now preserves source field metadata, consistent with `CAST` behavior. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- datafusion/expr/src/expr_schema.rs | 44 +++++++----- .../sqllogictest/test_files/metadata.slt | 67 +++++++++++++++++++ 2 files changed, 94 insertions(+), 17 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 81d788b1a2302..b613db4b6ad1d 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -65,6 +65,24 @@ pub trait ExprSchemable { -> Result<(DataType, bool)>; } +/// Derives the output field for a cast expression from the source field. +/// For `TryCast`, `force_nullable` is `true` since a failed cast returns NULL. +fn cast_output_field( + source_field: &FieldRef, + target_type: &DataType, + force_nullable: bool, +) -> Arc { + let mut f = source_field + .as_ref() + .clone() + .with_data_type(target_type.clone()) + .with_metadata(source_field.metadata().clone()); + if force_nullable { + f = f.with_nullable(true); + } + Arc::new(f) +} + impl ExprSchemable for Expr { /// Returns the [arrow::datatypes::DataType] of the expression /// based on [ExprSchema] @@ -553,33 +571,25 @@ impl ExprSchemable for Expr { func.return_field_from_args(args) } // _ => Ok((self.get_type(schema)?, self.nullable(schema)?)), - Expr::Cast(Cast { expr, field }) => expr - .to_field(schema) - .map(|(_table_ref, destination_field)| { - // This propagates the nullability of the input rather than - // force the nullability of the destination field. This is - // usually the desired behaviour (i.e., specifying a cast - // destination type usually does not force a user to pick - // nullability, and assuming `true` would prevent the non-nullability - // of the parent expression to make the result eligible for - // optimizations that only apply to non-nullable values). 
- destination_field - .as_ref() - .clone() - .with_data_type(field.data_type().clone()) - .with_metadata(destination_field.metadata().clone()) + Expr::Cast(Cast { expr, field }) => { + expr.to_field(schema).map(|(_table_ref, src)| { + cast_output_field(&src, field.data_type(), false) }) - .map(Arc::new), + } Expr::Placeholder(Placeholder { id: _, field: Some(field), }) => Ok(Arc::clone(field).renamed(&schema_name)), + Expr::TryCast(TryCast { expr, field }) => { + expr.to_field(schema).map(|(_table_ref, src)| { + cast_output_field(&src, field.data_type(), true) + }) + } Expr::Like(_) | Expr::SimilarTo(_) | Expr::Not(_) | Expr::Between(_) | Expr::Case(_) - | Expr::TryCast(_) | Expr::InList(_) | Expr::InSubquery(_) | Expr::SetComparison(_) diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index f3836b23ec321..11ed4fc632e2a 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -220,6 +220,34 @@ FROM table_with_metadata; 2020-09-08 2020-09-08 +# Regression test: CAST should preserve source field metadata +query DT +SELECT + CAST(ts AS DATE) as casted, + arrow_metadata(CAST(ts AS DATE), 'metadata_key') +FROM table_with_metadata; +---- +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field + +# Regression test: CAST preserves metadata on integer column +query IT +SELECT + CAST(id AS BIGINT) as casted, + arrow_metadata(CAST(id AS BIGINT), 'metadata_key') +FROM table_with_metadata; +---- +1 the id field +NULL the id field +3 the id field + +# Regression test: CAST with single-argument arrow_metadata (returns full map) +query ? 
+select arrow_metadata(CAST(id AS BIGINT)) from table_with_metadata limit 1; +---- +{metadata_key: the id field} + # Regression test: distinct with cast query D SELECT DISTINCT (ts::DATE) AS dist @@ -347,5 +375,44 @@ select arrow_metadata(id) from table_with_metadata limit 1; ---- {metadata_key: the id field} +# Regression test: TRY_CAST should preserve source field metadata +query DT +SELECT + TRY_CAST(ts AS DATE) as try_casted, + arrow_metadata(TRY_CAST(ts AS DATE), 'metadata_key') +FROM table_with_metadata; +---- +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field +2020-09-08 ts non-nullable field + +# Regression test: TRY_CAST preserves metadata on integer column +query IT +SELECT + TRY_CAST(id AS BIGINT) as try_casted, + arrow_metadata(TRY_CAST(id AS BIGINT), 'metadata_key') +FROM table_with_metadata; +---- +1 the id field +NULL the id field +3 the id field + +# Regression test: TRY_CAST preserves metadata even when cast fails (returns NULL) +query IT +SELECT + TRY_CAST(name AS INT) as try_casted, + arrow_metadata(TRY_CAST(name AS INT), 'metadata_key') +FROM table_with_metadata; +---- +NULL the name field +NULL the name field +NULL the name field + +# Regression test: TRY_CAST with single-argument arrow_metadata (returns full map) +query ? +select arrow_metadata(TRY_CAST(id AS BIGINT)) from table_with_metadata limit 1; +---- +{metadata_key: the id field} + statement ok drop table table_with_metadata; From 727022d9e7604640ec7e26c3375f9bd66626baf3 Mon Sep 17 00:00:00 2001 From: Qi Zhu Date: Tue, 7 Apr 2026 21:01:41 +0800 Subject: [PATCH 2/6] feat: make sort pushdown BufferExec capacity configurable, default 1GB (#21426) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? Closes #21417 ## Rationale for this change #21182 introduced `BufferExec` between `SortPreservingMergeExec` and `DataSourceExec` when sort elimination removes a `SortExec`. 
The buffer capacity was hardcoded to 64MB, which can cause I/O stalls for wide-row full scans. ## What changes are included in this PR? - Add `datafusion.execution.sort_pushdown_buffer_capacity` config option (default 1GB) - Replace hardcoded `BUFFER_CAPACITY_AFTER_SORT_ELIMINATION` constant with the config value - Update SLT test expectations for new default capacity ## How are these changes justified? **Why 1GB default:** - This is a maximum, not pre-allocated — actual usage is bounded by partition data size - Strictly less memory than the `SortExec` it replaces (which buffers entire partition) - `BufferExec` integrates with `MemoryPool`, so global memory limits are respected - 64MB was too small for wide-row scans (16-column TPC-H `SELECT *` queries showed I/O stalls) **Why configurable:** - Different workloads have different optimal buffer sizes - Users with memory-constrained environments can reduce it - Users with wide tables or large row groups can increase it ## Are these changes tested? - Existing SLT Test G verifies `BufferExec` appears in plan with correct capacity - Config integration tested via existing config framework ## Are there any user-facing changes? New config option: `datafusion.execution.sort_pushdown_buffer_capacity` (default: 1GB) --- datafusion/common/src/config.rs | 13 +++++++++++ .../physical-optimizer/src/pushdown_sort.rs | 22 ++++--------------- .../test_files/information_schema.slt | 2 ++ .../sqllogictest/test_files/sort_pushdown.slt | 4 ++-- docs/source/user-guide/configs.md | 1 + 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 38b18c06fe930..53f2501d60752 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -557,6 +557,19 @@ config_namespace! { /// batches and merged. 
pub sort_in_place_threshold_bytes: usize, default = 1024 * 1024 + /// Maximum buffer capacity (in bytes) per partition for BufferExec + /// inserted during sort pushdown optimization. + /// + /// When PushdownSort eliminates a SortExec under SortPreservingMergeExec, + /// a BufferExec is inserted to replace SortExec's buffering role. This + /// prevents I/O stalls by allowing the scan to run ahead of the merge. + /// + /// This uses strictly less memory than the SortExec it replaces (which + /// buffers the entire partition). The buffer respects the global memory + /// pool limit. Setting this to a large value is safe — actual memory + /// usage is bounded by partition size and global memory limits. + pub sort_pushdown_buffer_capacity: usize, default = 1024 * 1024 * 1024 + /// Maximum size in bytes for individual spill files before rotating to a new file. /// /// When operators spill data to disk (e.g., RepartitionExec), they write diff --git a/datafusion/physical-optimizer/src/pushdown_sort.rs b/datafusion/physical-optimizer/src/pushdown_sort.rs index 308e91d0df145..9da8ce60e1ba5 100644 --- a/datafusion/physical-optimizer/src/pushdown_sort.rs +++ b/datafusion/physical-optimizer/src/pushdown_sort.rs @@ -65,20 +65,6 @@ use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use std::sync::Arc; -/// Per-partition buffer capacity (in bytes) inserted between SPM and -/// DataSourceExec when sort elimination removes the buffering SortExec. -/// -/// SortExec buffers all input data in memory (potentially GB per partition) -/// before outputting sorted results. When we eliminate SortExec, SPM reads -/// directly from I/O-bound sources. BufferExec compensates with bounded -/// buffering, allowing I/O to pipeline with merge computation. 
-/// -/// This is strictly less memory than the SortExec it replaces, and only -/// inserted when PushdownSort eliminates a SortExec — no impact on other -/// query plans. BufferExec also integrates with MemoryPool, so it respects -/// the global memory limit and won't cause OOM. -const BUFFER_CAPACITY_AFTER_SORT_ELIMINATION: usize = 64 * 1024 * 1024; // 64 MB - /// A PhysicalOptimizerRule that attempts to push down sort requirements to data sources. /// /// See module-level documentation for details. @@ -102,6 +88,8 @@ impl PhysicalOptimizerRule for PushdownSort { return Ok(plan); } + let buffer_capacity = config.execution.sort_pushdown_buffer_capacity; + // Use transform_down to find and optimize all SortExec nodes (including nested ones) // Also handles SPM → SortExec pattern to insert BufferExec when sort is eliminated plan.transform_down(|plan: Arc| { @@ -124,10 +112,8 @@ impl PhysicalOptimizerRule for PushdownSort { // Insert BufferExec to replace SortExec's buffering role. // SortExec buffered all data in memory; BufferExec provides // bounded buffering so SPM doesn't stall on I/O. 
- let buffered: Arc = Arc::new(BufferExec::new( - inner, - BUFFER_CAPACITY_AFTER_SORT_ELIMINATION, - )); + let buffered: Arc = + Arc::new(BufferExec::new(inner, buffer_capacity)); let new_spm = SortPreservingMergeExec::new(spm.expr().clone(), buffered) .with_fetch(spm.fetch()); diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 77ae1d335fb8d..0b34f381cbc59 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -270,6 +270,7 @@ datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 datafusion.execution.skip_physical_aggregate_schema_check false datafusion.execution.soft_max_rows_per_output_file 50000000 datafusion.execution.sort_in_place_threshold_bytes 1048576 +datafusion.execution.sort_pushdown_buffer_capacity 1073741824 datafusion.execution.sort_spill_reservation_bytes 10485760 datafusion.execution.spill_compression uncompressed datafusion.execution.split_file_groups_by_statistics false @@ -413,6 +414,7 @@ datafusion.execution.skip_partial_aggregation_probe_rows_threshold 100000 Number datafusion.execution.skip_physical_aggregate_schema_check false When set to true, skips verifying that the schema produced by planning the input of `LogicalPlan::Aggregate` exactly matches the schema of the input plan. When set to false, if the schema does not match exactly (including nullability and metadata), a planning error will be raised. This is used to workaround bugs in the planner that are now caught by the new schema verification step. datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. 
There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. +datafusion.execution.sort_pushdown_buffer_capacity 1073741824 Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). datafusion.execution.spill_compression uncompressed Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. 
datafusion.execution.split_file_groups_by_statistics false Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental diff --git a/datafusion/sqllogictest/test_files/sort_pushdown.slt b/datafusion/sqllogictest/test_files/sort_pushdown.slt index e9d4e221e1ddb..b6c75f3977010 100644 --- a/datafusion/sqllogictest/test_files/sort_pushdown.slt +++ b/datafusion/sqllogictest/test_files/sort_pushdown.slt @@ -2221,7 +2221,7 @@ logical_plan 02)--TableScan: tg_buffer projection=[id, value] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] -02)--BufferExec: capacity=67108864 +02)--BufferExec: capacity=1073741824 03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], output_ordering=[id@0 ASC NULLS LAST], file_type=parquet # Verify correctness @@ -2248,7 +2248,7 @@ logical_plan 02)--TableScan: tg_buffer projection=[id, value] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 -02)--BufferExec: capacity=67108864 +02)--BufferExec: capacity=1073741824 03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/b_mid.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/a_high.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/sort_pushdown/tg_buffer/c_low.parquet]]}, projection=[id, value], limit=3, output_ordering=[id@0 ASC NULLS LAST], file_type=parquet query II diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index be42f4a0becb8..b88727e7e3b52 100644 --- a/docs/source/user-guide/configs.md +++ 
b/docs/source/user-guide/configs.md @@ -118,6 +118,7 @@ The following configuration settings are available: | datafusion.execution.spill_compression | uncompressed | Sets the compression codec used when spilling data to disk. Since datafusion writes spill files using the Arrow IPC Stream format, only codecs supported by the Arrow IPC Stream Writer are allowed. Valid values are: uncompressed, lz4_frame, zstd. Note: lz4_frame offers faster (de)compression, but typically results in larger spill files. In contrast, zstd achieves higher compression ratios at the cost of slower (de)compression speed. | | datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured). | | datafusion.execution.sort_in_place_threshold_bytes | 1048576 | When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged. | +| datafusion.execution.sort_pushdown_buffer_capacity | 1073741824 | Maximum buffer capacity (in bytes) per partition for BufferExec inserted during sort pushdown optimization. When PushdownSort eliminates a SortExec under SortPreservingMergeExec, a BufferExec is inserted to replace SortExec's buffering role. This prevents I/O stalls by allowing the scan to run ahead of the merge. This uses strictly less memory than the SortExec it replaces (which buffers the entire partition). The buffer respects the global memory pool limit. Setting this to a large value is safe — actual memory usage is bounded by partition size and global memory limits. 
| | datafusion.execution.max_spill_file_size_bytes | 134217728 | Maximum size in bytes for individual spill files before rotating to a new file. When operators spill data to disk (e.g., RepartitionExec), they write multiple batches to the same file until this size limit is reached, then rotate to a new file. This reduces syscall overhead compared to one-file-per-batch while preventing files from growing too large. A larger value reduces file creation overhead but may hold more disk space. A smaller value creates more files but allows finer-grained space reclamation as files can be deleted once fully consumed. Now only `RepartitionExec` supports this spill file rotation feature, other spilling operators may create spill files larger than the limit. Default: 128 MB | | datafusion.execution.meta_fetch_concurrency | 32 | Number of files to read in parallel when inferring schema and statistics | | datafusion.execution.minimum_parallel_output_files | 4 | Guarantees a minimum level of output files running in parallel. RecordBatches will be distributed in round robin fashion to each parallel writer. Each writer is closed and a new file opened once soft_max_rows_per_output_file is reached. | From c73b6e11ce6d1acdb3d67c602c284c86da1dc624 Mon Sep 17 00:00:00 2001 From: theirix Date: Tue, 7 Apr 2026 14:16:46 +0100 Subject: [PATCH 3/6] bench: add benchmarks for first_value, last_value (#21409) ## Which issue does this PR close? ## Rationale for this change Spin-off of #21383 to have a bench for `First_Value`, `Last_Value` available before a PR with logic change. ## What changes are included in this PR? - Add benchmark for `GroupsAccumulator`. It's pretty complicated to test aggregates with grouping, since many operations are stateful, so I introduced end-to-end evaluate test (to actually test taking state) and convert_to_state (as in other benches) - A bench for a simple `Accumulator` ## Are these changes tested? - Manual bench run ## Are there any user-facing changes? 
--- datafusion/functions-aggregate/Cargo.toml | 4 + .../functions-aggregate/benches/first_last.rs | 262 ++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 datafusion/functions-aggregate/benches/first_last.rs diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index 07ee89af84a0e..8aa7976ab1abd 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -79,3 +79,7 @@ name = "min_max_bytes" [[bench]] name = "approx_distinct" harness = false + +[[bench]] +name = "first_last" +harness = false diff --git a/datafusion/functions-aggregate/benches/first_last.rs b/datafusion/functions-aggregate/benches/first_last.rs new file mode 100644 index 0000000000000..6070470138aa6 --- /dev/null +++ b/datafusion/functions-aggregate/benches/first_last.rs @@ -0,0 +1,262 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::hint::black_box; +use std::sync::Arc; + +use arrow::array::{ArrayRef, BooleanArray}; +use arrow::compute::SortOptions; +use arrow::datatypes::{DataType, Field, Int64Type, Schema}; +use arrow::util::bench_util::{create_boolean_array, create_primitive_array}; + +use datafusion_expr::{ + Accumulator, AggregateUDFImpl, EmitTo, GroupsAccumulator, function::AccumulatorArgs, +}; +use datafusion_functions_aggregate::first_last::{FirstValue, LastValue}; +use datafusion_physical_expr::PhysicalSortExpr; +use datafusion_physical_expr::expressions::col; + +use criterion::{Criterion, criterion_group, criterion_main}; + +fn prepare_groups_accumulator(is_first: bool) -> Box { + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int64, true), + Field::new("ord", DataType::Int64, true), + ])); + + let order_expr = col("ord", &schema).unwrap(); + let sort_expr = PhysicalSortExpr { + expr: order_expr, + options: SortOptions::default(), + }; + + let value_field: Arc = Field::new("value", DataType::Int64, true).into(); + let accumulator_args = AccumulatorArgs { + return_field: Arc::clone(&value_field), + schema: &schema, + expr_fields: &[value_field], + ignore_nulls: false, + order_bys: std::slice::from_ref(&sort_expr), + is_reversed: false, + name: if is_first { + "FIRST_VALUE(value ORDER BY ord)" + } else { + "LAST_VALUE(value ORDER BY ord)" + }, + is_distinct: false, + exprs: &[col("value", &schema).unwrap()], + }; + + if is_first { + FirstValue::new() + .create_groups_accumulator(accumulator_args) + .unwrap() + } else { + LastValue::new() + .create_groups_accumulator(accumulator_args) + .unwrap() + } +} + +fn prepare_accumulator(is_first: bool) -> Box { + let schema = Arc::new(Schema::new(vec![ + Field::new("value", DataType::Int64, true), + Field::new("ord", DataType::Int64, true), + ])); + + let order_expr = col("ord", &schema).unwrap(); + let sort_expr = PhysicalSortExpr { + expr: order_expr, + options: SortOptions::default(), + }; + + let 
value_field: Arc = Field::new("value", DataType::Int64, true).into(); + let accumulator_args = AccumulatorArgs { + return_field: Arc::clone(&value_field), + schema: &schema, + expr_fields: &[value_field], + ignore_nulls: false, + order_bys: std::slice::from_ref(&sort_expr), + is_reversed: false, + name: if is_first { + "FIRST_VALUE(value ORDER BY ord)" + } else { + "LAST_VALUE(value ORDER BY ord)" + }, + is_distinct: false, + exprs: &[col("value", &schema).unwrap()], + }; + + if is_first { + FirstValue::new().accumulator(accumulator_args).unwrap() + } else { + LastValue::new().accumulator(accumulator_args).unwrap() + } +} + +#[expect(clippy::needless_pass_by_value)] +fn convert_to_state_bench( + c: &mut Criterion, + is_first: bool, + name: &str, + values: ArrayRef, + opt_filter: Option<&BooleanArray>, +) { + c.bench_function(name, |b| { + b.iter(|| { + let accumulator = prepare_groups_accumulator(is_first); + black_box( + accumulator + .convert_to_state(std::slice::from_ref(&values), opt_filter) + .unwrap(), + ) + }) + }); +} + +#[expect(clippy::needless_pass_by_value)] +fn evaluate_accumulator_bench( + c: &mut Criterion, + is_first: bool, + name: &str, + values: ArrayRef, + ord: ArrayRef, +) { + c.bench_function(name, |b| { + b.iter_batched( + || { + // setup, not timed + let mut accumulator = prepare_accumulator(is_first); + accumulator + .update_batch(&[Arc::clone(&values), Arc::clone(&ord)]) + .unwrap(); + accumulator + }, + |mut accumulator| black_box(accumulator.evaluate().unwrap()), + criterion::BatchSize::SmallInput, + ) + }); +} + +#[expect(clippy::needless_pass_by_value)] +#[expect(clippy::too_many_arguments)] +fn evaluate_bench( + c: &mut Criterion, + is_first: bool, + emit_to: EmitTo, + name: &str, + values: ArrayRef, + ord: ArrayRef, + opt_filter: Option<&BooleanArray>, + num_groups: usize, +) { + let n = values.len(); + let group_indices: Vec = (0..n).map(|i| i % num_groups).collect(); + + c.bench_function(name, |b| { + b.iter_batched( + || { + // 
setup, not timed + let mut accumulator = prepare_groups_accumulator(is_first); + accumulator + .update_batch( + &[Arc::clone(&values), Arc::clone(&ord)], + &group_indices, + opt_filter, + num_groups, + ) + .unwrap(); + accumulator + }, + |mut accumulator| black_box(accumulator.evaluate(emit_to).unwrap()), + criterion::BatchSize::SmallInput, + ) + }); +} + +fn first_last_benchmark(c: &mut Criterion) { + const N: usize = 65536; + const NUM_GROUPS: usize = 1024; + + assert_eq!(N % NUM_GROUPS, 0); + + for is_first in [true, false] { + for pct in [0, 90] { + let fn_name = if is_first { + "first_value" + } else { + "last_value" + }; + + let null_density = (pct as f32) / 100.0; + let values = Arc::new(create_primitive_array::(N, null_density)) + as ArrayRef; + let ord = Arc::new(create_primitive_array::(N, null_density)) + as ArrayRef; + + evaluate_accumulator_bench( + c, + is_first, + &format!("{fn_name} evaluate_accumulator_bench nulls={pct}%"), + values.clone(), + ord.clone(), + ); + + for with_filter in [false, true] { + let filter = create_boolean_array(N, 0.0, 0.5); + let opt_filter = if with_filter { Some(&filter) } else { None }; + + convert_to_state_bench( + c, + is_first, + &format!( + "{fn_name} convert_to_state nulls={pct}%, filter={with_filter}" + ), + values.clone(), + opt_filter, + ); + evaluate_bench( + c, + is_first, + EmitTo::First(2), + &format!( + "{fn_name} evaluate_bench nulls={pct}%, filter={with_filter}, first(2)" + ), + values.clone(), + ord.clone(), + opt_filter, + NUM_GROUPS, + ); + evaluate_bench( + c, + is_first, + EmitTo::All, + &format!( + "{fn_name} evaluate_bench nulls={pct}%, filter={with_filter}, all" + ), + values.clone(), + ord.clone(), + opt_filter, + NUM_GROUPS, + ); + } + } + } +} + +criterion_group!(benches, first_last_benchmark); +criterion_main!(benches); From 7eccbb94066abd189e37b3cc8872586f2d7b7384 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 09:25:58 
-0400 Subject: [PATCH 4/6] chore(deps): bump the all-other-cargo-deps group with 4 updates (#21435) Bumps the all-other-cargo-deps group with 4 updates: [indexmap](https://github.com/indexmap-rs/indexmap), [tokio](https://github.com/tokio-rs/tokio), [libc](https://github.com/rust-lang/libc) and [semver](https://github.com/dtolnay/semver). Updates `indexmap` from 2.13.0 to 2.13.1
Changelog

Sourced from indexmap's changelog.

2.13.1 (2026-04-02)

  • Made some Slice methods const:
    • map::Slice::{first,last,split_at,split_at_checked,split_first,split_last}
    • set::Slice::{first,last,split_at,split_at_checked,split_first,split_last}
Commits

Updates `tokio` from 1.50.0 to 1.51.0
Release notes

Sourced from tokio's releases.

Tokio v1.51.0

1.51.0 (April 3rd, 2026)

Added

  • net: implement get_peer_cred on Hurd (#7989)
  • runtime: add tokio::runtime::worker_index() (#7921)
  • runtime: add runtime name (#7924)
  • runtime: stabilize LocalRuntime (#7557)
  • wasm: add wasm32-wasip2 networking support (#7933)

Changed

  • runtime: steal tasks from the LIFO slot (#7431)

Fixed

  • docs: do not show "Available on non-loom only." doc label (#7977)
  • macros: improve overall macro hygiene (#7997)
  • sync: fix notify_waiters priority in Notify (#7996)
  • sync: fix panic in Chan::recv_many when called with non-empty vector on closed channel (#7991)

#7431: tokio-rs/tokio#7431 #7557: tokio-rs/tokio#7557 #7921: tokio-rs/tokio#7921 #7924: tokio-rs/tokio#7924 #7933: tokio-rs/tokio#7933 #7977: tokio-rs/tokio#7977 #7989: tokio-rs/tokio#7989 #7991: tokio-rs/tokio#7991 #7996: tokio-rs/tokio#7996 #7997: tokio-rs/tokio#7997

Commits

Updates `libc` from 0.2.183 to 0.2.184
Release notes

Sourced from libc's releases.

0.2.184

MSRV

This release increases the MSRV of libc to 1.65. With this update, you can now always use the core::ffi::c_* types with libc definitions, since libc has been changed to reexport from core rather than redefining them. (This usually worked before but had edge cases.) (#4972)

Added

  • BSD: Add IP_MINTTL to bsd (#5026)
  • Cygwin: Add TIOCM_DSR (#5031)
  • FreeBSD: Added xfile struct and file descriptor types (#5002)
  • Linux: Add CAN netlink bindings (#5011)
  • Linux: Add struct ethhdr (#4239)
  • Linux: Add struct ifinfomsg (#5012)
  • Linux: Define max_align_t for riscv64 (#5029)
  • NetBSD: Add missing CLOCK_ constants (#5020)
  • NuttX: Add _SC_HOST_NAME_MAX (#5004)
  • VxWorks: Add flock and F_*LCK constants (#4043)
  • WASI: Add all _SC_* sysconf constants (#5023)

Deprecated

The remaining fixed-width integer aliases, __uint128_t, __uint128, __int128_t, and __int128, have been deprecated. Use i128 and u128 instead. (#4343)

Fixed

  • breaking Redox: Fix signal action constant types (#5009)
  • EspIDF: Correct the value of DT_* constants (#5034)
  • Redox: Fix locale values and add RTLD_NOLOAD, some TCP constants (#5025)
  • Various: Use Padding::new(<zeroed>) rather than Padding::uninit() (#5036)

Changed

  • potentially breaking Linux: Add new fields to struct ptrace_syscall_info (#4966)
  • Re-export core::ffi integer types rather than redefining (#5015)
  • Redox: Update F_DUPFD, IP, and TCP constants to match relibc (#4990)
Changelog

Sourced from libc's changelog.

0.2.184 - 2026-04-01

MSRV

This release increases the MSRV of libc to 1.65. With this update, you can now always use the core::ffi::c_* types with libc definitions, since libc has been changed to reexport from core rather than redefining them. (This usually worked before but had edge cases.) (#4972)

Added

  • BSD: Add IP_MINTTL to bsd (#5026)
  • Cygwin: Add TIOCM_DSR (#5031)
  • FreeBSD: Added xfile struct and file descriptor types (#5002)
  • Linux: Add CAN netlink bindings (#5011)
  • Linux: Add struct ethhdr (#4239)
  • Linux: Add struct ifinfomsg (#5012)
  • Linux: Define max_align_t for riscv64 (#5029)
  • NetBSD: Add missing CLOCK_ constants (#5020)
  • NuttX: Add _SC_HOST_NAME_MAX (#5004)
  • VxWorks: Add flock and F_*LCK constants (#4043)
  • WASI: Add all _SC_* sysconf constants (#5023)

Deprecated

The remaining fixed-width integer aliases, __uint128_t, __uint128, __int128_t, and __int128, have been deprecated. Use i128 and u128 instead. (#4343)

Fixed

  • breaking Redox: Fix signal action constant types (#5009)
  • EspIDF: Correct the value of DT_* constants (#5034)
  • Redox: Fix locale values and add RTLD_NOLOAD, some TCP constants (#5025)
  • Various: Use Padding::new(<zeroed>) rather than Padding::uninit() (#5036)

Changed

  • potentially breaking Linux: Add new fields to struct ptrace_syscall_info (#4966)
  • Re-export core::ffi integer types rather than redefining (#5015)
  • Redox: Update F_DUPFD, IP, and TCP constants to match relibc (#4990)
Commits
  • b1fd610 chore: Release libc 0.2.184
  • f596819 ci: Don't enforce cargo-semver-checks
  • 4645f60 linux: update ptrace_syscall_info struct
  • 14cbbec types: Remove Padding::uninit
  • b5dcda8 pthread: Use Padding::new(<zeroed>) rather than Padding::uninit()
  • bbb1c5d types: Add a new function to Padding
  • df06e43 Fix locale values and add RTLD_NOLOAD, some TCP constants
  • 078f5c6 newlib/espidf: Move DT_* to espidf/mod.rs
  • d32b83d Add IP_MINTTL to bsd
  • 939e0ec Define max_align_t for riscv64-linux
  • Additional commits viewable in compare view

Updates `semver` from 1.0.27 to 1.0.28
Release notes

Sourced from semver's releases.

1.0.28

  • Documentation improvements
Commits
  • 7625c7a Release 1.0.28
  • fd404d0 Merge pull request 351 from czy-29/master
  • f75f26e The doc_auto_cfg and doc_cfg features have been merged
  • 9e2bfa2 Enable serde on docs.rs and automatically add serde flag to the docs
  • 8591f23 Unpin CI miri toolchain
  • 66bdd2c Pin CI miri to nightly-2026-02-11
  • 324ffce Switch from cargo bench to criterion
  • 34133a5 Update actions/upload-artifact@v5 -> v6
  • 7f935ff Update actions/upload-artifact@v4 -> v5
  • c07fb91 Switch from test::black_box to std::hint::black_box
  • Additional commits viewable in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore <dependency name> major version` will close this group update PR and stop Dependabot creating any more for the specific dependency's major version (unless you unignore this specific dependency's major version or upgrade to it yourself) - `@dependabot ignore <dependency name> minor version` will close this group update PR and stop Dependabot creating any more for the specific dependency's minor version (unless you unignore this specific dependency's minor version or upgrade to it yourself) - `@dependabot ignore <dependency name>` will close this group update PR and stop Dependabot creating any more for the specific dependency (unless you unignore this specific dependency or upgrade to it yourself) - `@dependabot unignore <dependency name>` will remove all of the ignore conditions of the specified dependency - `@dependabot unignore <dependency name> <ignore condition>` will remove the specified ignore condition of the specified dependency
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 70 ++++++++++++++++++------------------ Cargo.toml | 4 +-- datafusion/common/Cargo.toml | 2 +- datafusion/ffi/Cargo.toml | 2 +- 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 895b3059f50c1..983a74ed3cf03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,7 +270,7 @@ dependencies = [ "bzip2", "crc", "flate2", - "indexmap 2.13.0", + "indexmap 2.13.1", "liblzma", "rand 0.9.2", "serde", @@ -400,7 +400,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.13.0", + "indexmap 2.13.1", "itoa", "lexical-core", "memchr", @@ -1916,7 +1916,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "hex", - "indexmap 2.13.0", + "indexmap 2.13.1", "insta", "itertools 0.14.0", "libc", @@ -2175,7 +2175,7 @@ dependencies = [ "datafusion-functions-window-common", "datafusion-physical-expr-common", "env_logger", - "indexmap 2.13.0", + "indexmap 2.13.1", "insta", "itertools 0.14.0", "recursive", @@ -2189,7 +2189,7 @@ version = "53.0.0" dependencies = [ "arrow", "datafusion-common", - "indexmap 2.13.0", + "indexmap 2.13.1", "insta", "itertools 0.14.0", ] @@ -2385,7 +2385,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-sql", "env_logger", - "indexmap 2.13.0", + "indexmap 2.13.1", "insta", "itertools 0.14.0", "log", @@ -2408,7 +2408,7 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.16.1", - "indexmap 2.13.0", + "indexmap 2.13.1", "insta", "itertools 0.14.0", "parking_lot", @@ -2442,7 +2442,7 @@ dependencies = [ "datafusion-common", "datafusion-expr-common", "hashbrown 0.16.1", - "indexmap 2.13.0", + "indexmap 2.13.1", "itertools 0.14.0", "parking_lot", "rand 0.9.2", @@ -2492,7 +2492,7 @@ dependencies = [ "futures", "half", "hashbrown 0.16.1", - "indexmap 2.13.0", + "indexmap 2.13.1", "insta", "itertools 0.14.0", "log", @@ -2627,7 +2627,7 @@ dependencies = [ 
"datafusion-functions-nested", "datafusion-functions-window", "env_logger", - "indexmap 2.13.0", + "indexmap 2.13.1", "insta", "itertools 0.14.0", "log", @@ -3277,7 +3277,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.4.0", - "indexmap 2.13.0", + "indexmap 2.13.1", "slab", "tokio", "tokio-util", @@ -3699,9 +3699,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" dependencies = [ "equivalent", "hashbrown 0.16.1", @@ -3924,9 +3924,9 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.183" +version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" [[package]] name = "libloading" @@ -4100,9 +4100,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "wasi 0.11.1+wasi-snapshot-preview1", @@ -4544,7 +4544,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.13.0", + "indexmap 2.13.1", "serde", ] @@ -5482,9 +5482,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = 
"8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" dependencies = [ "serde", "serde_core", @@ -5543,7 +5543,7 @@ version = "1.0.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.13.1", "itoa", "memchr", "serde", @@ -5596,7 +5596,7 @@ dependencies = [ "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.13.0", + "indexmap 2.13.1", "schemars 0.9.0", "schemars 1.2.1", "serde_core", @@ -5623,7 +5623,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.13.1", "itoa", "ryu", "serde", @@ -5899,7 +5899,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e620ff4d5c02fd6f7752931aa74b16a26af66a63022cc1ad412c77edbe0bab47" dependencies = [ "heck", - "indexmap 2.13.0", + "indexmap 2.13.1", "pbjson 0.8.0", "pbjson-build 0.8.0", "pbjson-types", @@ -6162,9 +6162,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd" dependencies = [ "bytes", "libc", @@ -6179,9 +6179,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -6264,7 +6264,7 @@ version = "0.25.5+spec-1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.13.1", "toml_datetime", "toml_parser", "winnow", @@ -6327,7 +6327,7 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "indexmap 2.13.0", + "indexmap 2.13.1", "pin-project-lite", "slab", "sync_wrapper", @@ -6851,7 +6851,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" dependencies = [ "anyhow", - "indexmap 2.13.0", + "indexmap 2.13.1", "wasm-encoder", "wasmparser", ] @@ -6877,7 +6877,7 @@ checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" dependencies = [ "bitflags", "hashbrown 0.15.5", - "indexmap 2.13.0", + "indexmap 2.13.1", "semver", ] @@ -7257,7 +7257,7 @@ checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", "heck", - "indexmap 2.13.0", + "indexmap 2.13.1", "prettyplease", "syn 2.0.117", "wasm-metadata", @@ -7288,7 +7288,7 @@ checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" dependencies = [ "anyhow", "bitflags", - "indexmap 2.13.0", + "indexmap 2.13.1", "log", "serde", "serde_derive", @@ -7307,7 +7307,7 @@ checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" dependencies = [ "anyhow", "id-arena", - "indexmap 2.13.0", + "indexmap 2.13.1", "log", "semver", "serde", diff --git a/Cargo.toml b/Cargo.toml index 64673c025d299..9ac070194161e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -162,7 +162,7 @@ glob = "0.3.0" half = { version = "2.7.0", default-features = false } hashbrown = { version = "0.16.1" } hex = { version = "0.4.3" } -indexmap = "2.13.0" +indexmap = "2.13.1" insta = { version = "1.47.2", features = ["glob", "filters"] } itertools = "0.14" itoa = "1.0" @@ -192,7 
+192,7 @@ strum = "0.28.0" strum_macros = "0.28.0" tempfile = "3" testcontainers-modules = { version = "0.15" } -tokio = { version = "1.48", features = ["macros", "rt", "sync"] } +tokio = { version = "1.51", features = ["macros", "rt", "sync"] } tokio-stream = "0.1" tokio-util = "0.7" url = "2.5.7" diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index dbb81a4824c0d..fda229ae0ba6b 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -75,7 +75,7 @@ hashbrown = { workspace = true } hex = { workspace = true, optional = true } indexmap = { workspace = true } itertools = { workspace = true } -libc = "0.2.180" +libc = "0.2.184" log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 21264b59e05a3..1cb5207f21802 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -69,7 +69,7 @@ datafusion-session = { workspace = true } futures = { workspace = true } log = { workspace = true } prost = { workspace = true } -semver = "1.0.27" +semver = "1.0.28" tokio = { workspace = true } [dev-dependencies] From 94c7bc2fb6af71edceddfb3d0b1e5b00515e3007 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 07:57:34 -0700 Subject: [PATCH 5/6] chore(deps): bump taiki-e/install-action from 2.70.3 to 2.74.0 (#21434) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.70.3 to 2.74.0.
Release notes

Sourced from taiki-e/install-action's releases.

2.74.0

  • Support cargo-deb. (#1669)

  • Update just@latest to 1.49.0.

  • Update mise@latest to 2026.4.4.

2.73.0

  • Introduce dependency cooldown when installing with taiki-e/install-action@<tool_name>, tool: <tool_name>@latest, or tool: <tool_name>@<omitted_version> to mitigate the risk of supply chain attacks by default. (#1666)

    This action without this cooldown already takes a few hours to a few days for new releases to be reflected (as with other common package managers that verify checksums or signatures), so this should not affect most users.

    See the "Security" section in readme for more details.

  • Improve robustness for network failure.

  • Documentation improvements.

2.72.0

  • Support cargo-xwin. (#1659, thanks @​daxpedda)

  • Support trailing comma in tool input option.

  • Update tombi@latest to 0.9.14.

2.71.3

  • Update wasm-tools@latest to 1.246.2.

  • Update mise@latest to 2026.4.3.

2.71.2

  • Implement workaround for windows-11-arm runner bug which sometimes causes installation failure. (#1657)

    This addresses an issue that was attempted to be worked around in 2.71.0 but was insufficient.

  • Update mise@latest to 2026.4.1.

  • Update uv@latest to 0.11.3.

2.71.1

  • Fix a regression that caused an execution policy violation on self-hosted Windows runner due to use of non-default powershell shell, introduced in 2.71.0.

  • Update dprint@latest to 0.53.2.

2.71.0

... (truncated)

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

  • Update tombi@latest to 0.9.15.

[2.74.0] - 2026-04-06

  • Support cargo-deb. (#1669)

  • Update just@latest to 1.49.0.

  • Update mise@latest to 2026.4.4.

[2.73.0] - 2026-04-05

  • Introduce dependency cooldown when installing with taiki-e/install-action@<tool_name>, tool: <tool_name>@latest, or tool: <tool_name>@<omitted_version> to mitigate the risk of supply chain attacks by default. (#1666)

    This action without this cooldown already takes a few hours to a few days for new releases to be reflected (as with other common package managers that verify checksums or signatures), so this should not affect most users.

    See the "Security" section in readme for more details.

  • Improve robustness for network failure.

  • Documentation improvements.

[2.72.0] - 2026-04-04

  • Support cargo-xwin. (#1659, thanks @​daxpedda)

  • Support trailing comma in tool input option.

  • Update tombi@latest to 0.9.14.

[2.71.3] - 2026-04-04

  • Update wasm-tools@latest to 1.246.2.

  • Update mise@latest to 2026.4.3.

[2.71.2] - 2026-04-02

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.70.3&new-version=2.74.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index c880d1bae000e..b66cb460ff086 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install cargo-audit - uses: taiki-e/install-action@6ef672efc2b5aabc787a9e94baf4989aa02a97df # v2.70.3 + uses: taiki-e/install-action@94cb46f8d6e437890146ffbd78a778b78e623fb2 # v2.74.0 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 60ee8d9736aeb..f8772e1ff2a51 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -430,7 +430,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@6ef672efc2b5aabc787a9e94baf4989aa02a97df # v2.70.3 + uses: taiki-e/install-action@94cb46f8d6e437890146ffbd78a778b78e623fb2 # v2.74.0 with: tool: wasm-pack - name: Run tests with headless mode @@ -770,7 +770,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@6ef672efc2b5aabc787a9e94baf4989aa02a97df # v2.70.3 + uses: taiki-e/install-action@94cb46f8d6e437890146ffbd78a778b78e623fb2 # v2.74.0 with: tool: cargo-msrv From 7c3b22c2219011c593484f76d6c1c955252e7842 Mon Sep 17 00:00:00 2001 From: Eren Avsarogullari Date: Tue, 7 Apr 2026 08:26:29 -0700 Subject: [PATCH 6/6] test: Add `datafusion.format.*` configs test coverage (#21355) ## Which issue does this PR close? - Closes #21354. 
## Rationale for this change Currently, DataFusion supports 9 `datafusion.format.*` configs but their test coverage seem to be missed so this issue aims to add comprehensive test coverage for them. This is follow-up to recent `config framework` improvements: https://github.com/apache/datafusion/pull/20372 and https://github.com/apache/datafusion/pull/20816. ## What changes are included in this PR? New test coverage is being added for `datafusion.format.*` configs. ## Are these changes tested? Yes, new test coverage is being added for `datafusion.format.*` configs. ## Are there any user-facing changes? No --- .../sqllogictest/test_files/set_variable.slt | 198 ++++++++++++++++++ 1 file changed, 198 insertions(+) diff --git a/datafusion/sqllogictest/test_files/set_variable.slt b/datafusion/sqllogictest/test_files/set_variable.slt index 2ba3a73d85696..f270b9b169572 100644 --- a/datafusion/sqllogictest/test_files/set_variable.slt +++ b/datafusion/sqllogictest/test_files/set_variable.slt @@ -379,6 +379,204 @@ RESET datafusion.execution.batches_size statement error DataFusion error: Invalid or Unsupported Configuration: Config field is a scalar usize and does not have nested field "bar" RESET datafusion.execution.batch_size.bar +############################################# +## Test datafusion.format.* configurations ## +############################################# +query T +SELECT name FROM information_schema.df_settings WHERE name LIKE 'datafusion.format.%' ORDER BY name +---- +datafusion.format.date_format +datafusion.format.datetime_format +datafusion.format.duration_format +datafusion.format.null +datafusion.format.safe +datafusion.format.time_format +datafusion.format.timestamp_format +datafusion.format.timestamp_tz_format +datafusion.format.types_info + +# date_format: SET / SHOW / RESET / SHOW +statement ok +SET datafusion.format.date_format = '%d-%m-%Y' + +query TT +SHOW datafusion.format.date_format +---- +datafusion.format.date_format %d-%m-%Y + +statement ok 
+RESET datafusion.format.date_format + +query TT +SHOW datafusion.format.date_format +---- +datafusion.format.date_format %Y-%m-%d + +# datetime_format +statement ok +SET datafusion.format.datetime_format = '%Y/%m/%d %H:%M:%S' + +query TT +SHOW datafusion.format.datetime_format +---- +datafusion.format.datetime_format %Y/%m/%d %H:%M:%S + +statement ok +RESET datafusion.format.datetime_format + +query TT +SHOW datafusion.format.datetime_format +---- +datafusion.format.datetime_format %Y-%m-%dT%H:%M:%S%.f + +# timestamp_format +statement ok +SET datafusion.format.timestamp_format = '%FT%H:%M:%S' + +query TT +SHOW datafusion.format.timestamp_format +---- +datafusion.format.timestamp_format %FT%H:%M:%S + +statement ok +RESET datafusion.format.timestamp_format + +query TT +SHOW datafusion.format.timestamp_format +---- +datafusion.format.timestamp_format %Y-%m-%dT%H:%M:%S%.f + +# timestamp_tz_format (default NULL) +statement ok +SET datafusion.format.timestamp_tz_format = '%Y-%m-%d %H:%M:%S %z' + +query TT +SHOW datafusion.format.timestamp_tz_format +---- +datafusion.format.timestamp_tz_format %Y-%m-%d %H:%M:%S %z + +statement ok +RESET datafusion.format.timestamp_tz_format + +query TT +SHOW datafusion.format.timestamp_tz_format +---- +datafusion.format.timestamp_tz_format NULL + +# time_format +statement ok +SET datafusion.format.time_format = '%H-%M-%S' + +query TT +SHOW datafusion.format.time_format +---- +datafusion.format.time_format %H-%M-%S + +statement ok +RESET datafusion.format.time_format + +query TT +SHOW datafusion.format.time_format +---- +datafusion.format.time_format %H:%M:%S%.f + +# duration_format: values are normalized to lowercase; ISO8601 and pretty are valid +statement ok +SET datafusion.format.duration_format = ISO8601 + +query TT +SHOW datafusion.format.duration_format +---- +datafusion.format.duration_format iso8601 + +statement ok +SET datafusion.format.duration_format to 'PRETTY' + +query TT +SHOW datafusion.format.duration_format +---- 
+datafusion.format.duration_format pretty + +statement ok +RESET datafusion.format.duration_format + +query TT +SHOW datafusion.format.duration_format +---- +datafusion.format.duration_format pretty + +# null display string +statement ok +SET datafusion.format.null = 'NuLL' + +query TT +SHOW datafusion.format.null +---- +datafusion.format.null NuLL + +statement ok +RESET datafusion.format.null + +query TT +SHOW datafusion.format.null +---- +datafusion.format.null (empty) + +# safe +statement ok +SET datafusion.format.safe = false + +query TT +SHOW datafusion.format.safe +---- +datafusion.format.safe false + +statement ok +RESET datafusion.format.safe + +query TT +SHOW datafusion.format.safe +---- +datafusion.format.safe true + +# types_info +statement ok +SET datafusion.format.types_info to true + +query TT +SHOW datafusion.format.types_info +---- +datafusion.format.types_info true + +statement ok +RESET datafusion.format.types_info + +query TT +SHOW datafusion.format.types_info +---- +datafusion.format.types_info false + +# Case-insensitive variable name +statement ok +SET datafusion.FORMAT.DATE_FORMAT = '%m/%d/%Y' + +query TT +SHOW datafusion.format.date_format +---- +datafusion.format.date_format %m/%d/%Y + +statement ok +RESET datafusion.format.date_format + +query TT +SHOW datafusion.format.date_format +---- +datafusion.format.date_format %Y-%m-%d + +# Invalid format option name +statement error DataFusion error: Invalid or Unsupported Configuration: Config value "unknown_option" not found on FormatOptions +SET datafusion.format.unknown_option = true + ############ ## Test runtime configuration variables ############