From 2d9e268bc7f8462ebb08106e4d55d37a8a9dfb24 Mon Sep 17 00:00:00 2001 From: Eren Avsarogullari Date: Tue, 31 Mar 2026 06:14:58 -0700 Subject: [PATCH 01/15] docs: Add `RESET` Command Documentation (#21245) ## Which issue does this PR close? - Closes #21244. ## Rationale for this change `datafusion.catalog/execution/optimizer/explain/sql_parser/format.*` and `datafusion.runtime.*` configurations can be set by using `SET` Command. Also, they can be reset to their default value by using `RESET` Command. The SET command has documentation but RESET does not, so this PR aims to add RESET Command documentation like SET Command. ## What changes are included in this PR? RESET Command Documentation is being added. ## Are these changes tested? Not required. ## Are there any user-facing changes? Yes --- dev/update_config_docs.sh | 17 ++++++++++++++++- docs/source/user-guide/configs.md | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/dev/update_config_docs.sh b/dev/update_config_docs.sh index f39bdda3aee87..7ab998f3dad48 100755 --- a/dev/update_config_docs.sh +++ b/dev/update_config_docs.sh @@ -101,9 +101,24 @@ EOF echo "Running CLI and inserting config docs table" $PRINT_CONFIG_DOCS_COMMAND >> "$TARGET_FILE" -echo "Inserting runtime config header" +echo "Inserting reset command details and runtime config header" cat <<'EOF' >> "$TARGET_FILE" +You can also reset configuration options to default settings via SQL using the `RESET` command. For +example, to set and reset `datafusion.execution.batch_size`: + +```sql +SET datafusion.execution.batch_size = '10000'; + +SHOW datafusion.execution.batch_size; +datafusion.execution.batch_size 10000 + +RESET datafusion.execution.batch_size; + +SHOW datafusion.execution.batch_size; +datafusion.execution.batch_size 8192 +``` + # Runtime Configuration Settings DataFusion runtime configurations can be set via SQL using the `SET` command. 
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 95abb2769d287..69627e3cb9148 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -197,6 +197,21 @@ The following configuration settings are available: | datafusion.format.duration_format | pretty | Duration format. Can be either `"pretty"` or `"ISO8601"` | | datafusion.format.types_info | false | Show types in visual representation batches | +You can also reset configuration options to default settings via SQL using the `RESET` command. For +example, to set and reset `datafusion.execution.batch_size`: + +```sql +SET datafusion.execution.batch_size = '10000'; + +SHOW datafusion.execution.batch_size; +datafusion.execution.batch_size 10000 + +RESET datafusion.execution.batch_size; + +SHOW datafusion.execution.batch_size; +datafusion.execution.batch_size 8192 +``` + # Runtime Configuration Settings DataFusion runtime configurations can be set via SQL using the `SET` command. From 0bf9defb1d9587ab2810e6e8adff512f4db58037 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 31 Mar 2026 09:15:15 -0400 Subject: [PATCH 02/15] fix: Fix three bugs in query decorrelation (#21208) ## Which issue does this PR close? - Closes #21205. - Closes #21206. - Closes #20315. ## Rationale for this change This PR fixes three separate bugs in query decorrelation: 1. When removing duplicate filters as part of pulling up `IN` subqueries, an operator precedence error meant that we would consider two filters to be duplicates even if they involved different operators (e.g., `=` and `>`). 2. When generating the `CASE` used to implement "count bug" handling, we referenced the subquery output column without qualifying it by the subquery alias. This could result in name-collisions with unrelated identifiers in the parent query. 3. 
After generating the `CASE` used for "count bug" handling, we rewrote the parent query to replace references to the subquery output column with the generated `CASE` expression. This rewrite matched on the unqualified column name only, which meant that unrelated parent query identifiers that happened to share the same column name as subquery aggregate aliases could be rewritten by mistake. The first and third issues could result in incorrect query results; the second would only cause spurious errors, as far as I can see. ## What changes are included in this PR? * Fix all three bugs * Add SLT tests * Add a `debug_assert!` to document/check that `remove_duplicated_filter` is called with a commutative operator ## Are these changes tested? Yes. ## Are there any user-facing changes? No, except in the sense that they fix user-visible bugs. --- datafusion/optimizer/src/decorrelate.rs | 34 +++++++--- .../optimizer/src/scalar_subquery_to_join.rs | 31 ++++----- .../sqllogictest/test_files/subquery.slt | 68 +++++++++++++++++++ 3 files changed, 108 insertions(+), 25 deletions(-) diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index e7bc62e8da097..08839b49ef4b0 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -26,7 +26,9 @@ use crate::simplify_expressions::ExprSimplifier; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, }; -use datafusion_common::{Column, DFSchemaRef, HashMap, Result, ScalarValue, plan_err}; +use datafusion_common::{ + Column, DFSchemaRef, HashMap, Result, ScalarValue, assert_or_internal_err, plan_err, +}; use datafusion_expr::expr::Alias; use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::utils::{ @@ -179,7 +181,7 @@ impl TreeNodeRewriter for PullUpCorrelatedExpr { find_join_exprs(subquery_filter_exprs)?; if let Some(in_predicate) = &self.in_predicate_opt { // in_predicate may 
be already included in the join filters, remove it from the join filters first. - join_filters = remove_duplicated_filter(join_filters, in_predicate); + join_filters = remove_duplicated_filter(join_filters, in_predicate)?; } let correlated_subquery_cols = collect_subquery_cols(&join_filters, subquery_schema)?; @@ -460,25 +462,39 @@ fn collect_local_correlated_cols( } } -fn remove_duplicated_filter(filters: Vec, in_predicate: &Expr) -> Vec { - filters +fn remove_duplicated_filter( + filters: Vec, + in_predicate: &Expr, +) -> Result> { + // We assume below that swapping the order of operands to an operator does + // not change behavior, which is only true if the operator is commutative. + assert_or_internal_err!( + match in_predicate { + Expr::BinaryExpr(b) => b.op.swap() == Some(b.op), + _ => true, + }, + "remove_duplicated_filter: in_predicate must use a commutative operator" + ); + + Ok(filters .into_iter() .filter(|filter| { if filter == in_predicate { return false; } - // ignore the binary order + // Treat swapped operand order to a binary operator as equivalent !match (filter, in_predicate) { (Expr::BinaryExpr(a_expr), Expr::BinaryExpr(b_expr)) => { - (a_expr.op == b_expr.op) - && (a_expr.left == b_expr.left && a_expr.right == b_expr.right) - || (a_expr.left == b_expr.right && a_expr.right == b_expr.left) + a_expr.op == b_expr.op + && ((a_expr.left == b_expr.left && a_expr.right == b_expr.right) + || (a_expr.left == b_expr.right + && a_expr.right == b_expr.left)) } _ => false, } }) - .collect::>() + .collect::>()) } fn agg_exprs_evaluation_result_on_empty_batch( diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 975c234b38836..590b00098bd46 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -111,7 +111,7 @@ impl OptimizerRule for ScalarSubqueryToJoin { // replace column references with entry in map, if it exists if 
let Some(map_expr) = expr .try_as_col() - .and_then(|col| expr_check_map.get(&col.name)) + .and_then(|col| expr_check_map.get(col)) { Ok(Transformed::yes(map_expr.clone())) } else { @@ -176,7 +176,7 @@ impl OptimizerRule for ScalarSubqueryToJoin { // replace column references with entry in map, if it exists if let Some(map_expr) = expr .try_as_col() - .and_then(|col| expr_check_map.get(&col.name)) + .and_then(|col| expr_check_map.get(col)) { Ok(Transformed::yes(map_expr.clone())) } else { @@ -301,7 +301,7 @@ fn build_join( subquery: &Subquery, filter_input: &LogicalPlan, subquery_alias: &str, -) -> Result)>> { +) -> Result)>> { let subquery_plan = subquery.subquery.as_ref(); let mut pull_up = PullUpCorrelatedExpr::new().with_need_handle_count_bug(true); let new_plan = subquery_plan.clone().rewrite(&mut pull_up).data()?; @@ -358,14 +358,19 @@ fn build_join( // If expr always returns null when column is null, skip processing continue; } + + let indicator_col = + Column::new(Some(subquery_alias), UN_MATCHED_ROW_INDICATOR); + // Qualify with the subquery alias to avoid ambiguity when the + // outer table has a column with the same name as the aggregate. 
+ let value_col = Column::new(Some(subquery_alias), name.clone()); + let computer_expr = if let Some(filter) = &pull_up.pull_up_having_expr { Expr::Case(expr::Case { expr: None, when_then_expr: vec![ ( - Box::new(Expr::IsNull(Box::new(Expr::Column( - Column::new_unqualified(UN_MATCHED_ROW_INDICATOR), - )))), + Box::new(Expr::IsNull(Box::new(Expr::Column(indicator_col)))), Box::new(result), ), ( @@ -373,29 +378,23 @@ fn build_join( Box::new(Expr::Literal(ScalarValue::Null, None)), ), ], - else_expr: Some(Box::new(Expr::Column(Column::new_unqualified( - name.clone(), - )))), + else_expr: Some(Box::new(Expr::Column(value_col.clone()))), }) } else { Expr::Case(expr::Case { expr: None, when_then_expr: vec![( - Box::new(Expr::IsNull(Box::new(Expr::Column( - Column::new_unqualified(UN_MATCHED_ROW_INDICATOR), - )))), + Box::new(Expr::IsNull(Box::new(Expr::Column(indicator_col)))), Box::new(result), )], - else_expr: Some(Box::new(Expr::Column(Column::new_unqualified( - name.clone(), - )))), + else_expr: Some(Box::new(Expr::Column(value_col.clone()))), }) }; let mut expr_rewrite = TypeCoercionRewriter { schema: new_plan.schema(), }; computation_project_expr - .insert(name, computer_expr.rewrite(&mut expr_rewrite).data()?); + .insert(value_col, computer_expr.rewrite(&mut expr_rewrite).data()?); } } diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index e5ca9d674e19f..7f88199b3c0ef 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -1671,3 +1671,71 @@ drop table employees; statement count 0 drop table project_assignments; + +# https://github.com/apache/datafusion/issues/21205 +statement ok +CREATE TABLE dup_filter_t1(id INTEGER) AS VALUES (1), (2), (3); + +statement ok +CREATE TABLE dup_filter_t2(id INTEGER) AS VALUES (1), (2), (3); + +query I +SELECT * FROM dup_filter_t1 WHERE dup_filter_t1.id IN ( + SELECT dup_filter_t2.id FROM dup_filter_t2 WHERE 
dup_filter_t2.id > dup_filter_t1.id +); +---- + +statement ok +DROP TABLE dup_filter_t1; + +statement ok +DROP TABLE dup_filter_t2; + +# https://github.com/apache/datafusion/issues/21206 +statement ok +CREATE TABLE sq_name_t1(id INTEGER) AS VALUES (1), (2), (3); + +statement ok +CREATE TABLE sq_name_t2(id INTEGER, outer_id INTEGER) AS VALUES (10, 1), (20, 1), (30, 2); + +query II +SELECT sq_name_t1.id, + (SELECT count(*) AS id FROM sq_name_t2 WHERE sq_name_t2.outer_id = sq_name_t1.id) AS cnt +FROM sq_name_t1 +ORDER BY sq_name_t1.id; +---- +1 2 +2 1 +3 0 + +query I +SELECT sq_name_t1.id +FROM sq_name_t1 +WHERE sq_name_t1.id > ( + SELECT count(*) AS id + FROM sq_name_t2 + WHERE sq_name_t2.outer_id = sq_name_t1.id +) +ORDER BY sq_name_t1.id; +---- +2 +3 + +query I +SELECT sq_name_t1.id * 10 + ( + SELECT count(*) AS id + FROM sq_name_t2 + WHERE sq_name_t2.outer_id = sq_name_t1.id +) AS total +FROM sq_name_t1 +ORDER BY sq_name_t1.id; +---- +12 +21 +30 + +statement ok +DROP TABLE sq_name_t1; + +statement ok +DROP TABLE sq_name_t2; From d87d8f6cacc05bb0c3618a6e3b3a218ef827a885 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 31 Mar 2026 11:43:14 -0400 Subject: [PATCH 03/15] perf: Optimize `string_to_array` for scalar args (#21131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #21129. ## Rationale for this change When the delimiter (and null string, if supplied) are scalars, we can implement `string_to_array` more efficiently. In particular, we can construct a `memmem::Finder` and use it to search for delimiters more efficiently. This PR implements this optimization; it also fixes a place where we were allocating an intermediate `String` for every character when the delimiter is `NULL`. (This isn't a common case but worth fixing.) 
Benchmarks (M4 Max): ``` single_char_delim/5: 34.8 µs (was 61.1 µs) -43% single_char_delim/20: 145.1 µs (was 220.7 µs) -34% single_char_delim/100: 679.4 µs (was 1.04 ms) -35% multi_char_delim/5: 41.7 µs (was 56.7 µs) -27% multi_char_delim/20: 158.9 µs (was 185.1 µs) -14% multi_char_delim/100: 731.4 µs (was 858.3 µs) -15% with_null_str/5: 43.1 µs (was 68.7 µs) -37% with_null_str/20: 179.3 µs (was 244.3 µs) -27% with_null_str/100: 895.8 µs (was 1.16 ms) -23% null_delim/5: 17.4 µs (was 64.1 µs) -73% null_delim/20: 63.0 µs (was 233.4 µs) -73% null_delim/100: 280.2 µs (was 1.12 ms) -75% columnar_delim/5: 65.2 µs (was 60.2 µs) +8% columnar_delim/20: 217.2 µs (was 224.1 µs) -3% columnar_delim/100: 1.02 ms (was 1.05 ms) -3% ``` ## What changes are included in this PR? * Add benchmark for `string_to_array` * Implement optimizations described above * Refactor columnar (fallback) path to get rid of a lot of type dispatch boilerplate * Improve SLT test coverage for the "columnar string, scalar other-args" case ## Are these changes tested? Yes. ## Are there any user-facing changes? No. 
--- Cargo.lock | 1 + datafusion/functions-nested/Cargo.toml | 5 + .../benches/string_to_array.rs | 244 ++++++++ datafusion/functions-nested/src/string.rs | 544 +++++++++--------- datafusion/sqllogictest/test_files/array.slt | 68 +++ 5 files changed, 585 insertions(+), 277 deletions(-) create mode 100644 datafusion/functions-nested/benches/string_to_array.rs diff --git a/Cargo.lock b/Cargo.lock index e0cc5845c00e5..802bfc4703010 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2285,6 +2285,7 @@ dependencies = [ "itertools 0.14.0", "itoa", "log", + "memchr", "rand 0.9.2", ] diff --git a/datafusion/functions-nested/Cargo.toml b/datafusion/functions-nested/Cargo.toml index 6e96a44fc98c4..31462d5e509ed 100644 --- a/datafusion/functions-nested/Cargo.toml +++ b/datafusion/functions-nested/Cargo.toml @@ -61,6 +61,7 @@ hashbrown = { workspace = true } itertools = { workspace = true, features = ["use_std"] } itoa = { workspace = true } log = { workspace = true } +memchr = { workspace = true } [dev-dependencies] criterion = { workspace = true, features = ["async_tokio"] } @@ -117,3 +118,7 @@ name = "array_position" [[bench]] harness = false name = "array_sort" + +[[bench]] +harness = false +name = "string_to_array" diff --git a/datafusion/functions-nested/benches/string_to_array.rs b/datafusion/functions-nested/benches/string_to_array.rs new file mode 100644 index 0000000000000..e403d5e51bac8 --- /dev/null +++ b/datafusion/functions-nested/benches/string_to_array.rs @@ -0,0 +1,244 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, StringArray}; +use arrow::datatypes::{DataType, Field}; +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use datafusion_common::ScalarValue; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions_nested::string::StringToArray; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint::black_box; +use std::sync::Arc; + +const NUM_ROWS: usize = 1000; +const SEED: u64 = 42; + +fn criterion_benchmark(c: &mut Criterion) { + // Single-char delimiter + let comma = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string()))); + bench_string_to_array( + c, + "string_to_array_single_char_delim", + create_csv_strings, + &comma, + None, + ); + + // Multi-char delimiter + let double_colon = ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string()))); + bench_string_to_array( + c, + "string_to_array_multi_char_delim", + create_multi_delim_strings, + &double_colon, + None, + ); + + // With null_str argument + let null_str = ColumnarValue::Scalar(ScalarValue::Utf8(Some("NULL".to_string()))); + bench_string_to_array( + c, + "string_to_array_with_null_str", + create_csv_strings_with_nulls, + &comma, + Some(&null_str), + ); + + // NULL delimiter + let null_delim = ColumnarValue::Scalar(ScalarValue::Utf8(None)); + bench_string_to_array( + c, + "string_to_array_null_delim", + create_short_strings, + &null_delim, + None, + ); + + // Columnar delimiter (fall-back path) + 
bench_string_to_array_columnar_delim(c); +} + +fn bench_string_to_array_columnar_delim(c: &mut Criterion) { + let mut group = c.benchmark_group("string_to_array_columnar_delim"); + + for &num_elements in &[5, 20, 100] { + let string_array = create_csv_strings(num_elements); + let delimiter_array: ArrayRef = + Arc::new(StringArray::from(vec![Some(","); NUM_ROWS])); + + let args = vec![ + ColumnarValue::Array(string_array.clone()), + ColumnarValue::Array(delimiter_array), + ]; + let arg_fields = vec![ + Field::new("str", DataType::Utf8, true).into(), + Field::new("delimiter", DataType::Utf8, false).into(), + ]; + + let return_field = Field::new( + "result", + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))), + true, + ); + + group.bench_with_input( + BenchmarkId::from_parameter(num_elements), + &num_elements, + |b, _| { + let udf = StringToArray::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone().into(), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap(), + ) + }) + }, + ); + } + + group.finish(); +} + +fn bench_string_to_array( + c: &mut Criterion, + group_name: &str, + make_strings: fn(usize) -> ArrayRef, + delimiter: &ColumnarValue, + null_str: Option<&ColumnarValue>, +) { + let mut group = c.benchmark_group(group_name); + + for &num_elements in &[5, 20, 100] { + let string_array = make_strings(num_elements); + + let mut args = vec![ + ColumnarValue::Array(string_array.clone()), + delimiter.clone(), + ]; + let mut arg_fields = vec![ + Field::new("str", DataType::Utf8, true).into(), + Field::new("delimiter", DataType::Utf8, true).into(), + ]; + if let Some(ns) = null_str { + args.push(ns.clone()); + arg_fields.push(Field::new("null_str", DataType::Utf8, true).into()); + } + + let return_field = Field::new( + "result", + DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, 
true))), + true, + ); + + group.bench_with_input( + BenchmarkId::from_parameter(num_elements), + &num_elements, + |b, _| { + let udf = StringToArray::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args.clone(), + arg_fields: arg_fields.clone(), + number_rows: NUM_ROWS, + return_field: return_field.clone().into(), + config_options: Arc::new(ConfigOptions::default()), + }) + .unwrap(), + ) + }) + }, + ); + } + + group.finish(); +} + +/// Creates strings like "val1,val2,val3,...,valN" with `num_elements` elements. +fn create_csv_strings(num_elements: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let parts: Vec = (0..num_elements) + .map(|_| format!("val{}", rng.random_range(0..1000))) + .collect(); + Some(parts.join(",")) + }) + .collect(); + Arc::new(strings) +} + +/// Creates strings like "val1::val2::val3::...::valN". +fn create_multi_delim_strings(num_elements: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let parts: Vec = (0..num_elements) + .map(|_| format!("val{}", rng.random_range(0..1000))) + .collect(); + Some(parts.join("::")) + }) + .collect(); + Arc::new(strings) +} + +/// Creates CSV strings where ~10% of elements are the literal "NULL". +fn create_csv_strings_with_nulls(num_elements: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let parts: Vec = (0..num_elements) + .map(|_| { + if rng.random::() < 0.1 { + "NULL".to_string() + } else { + format!("val{}", rng.random_range(0..1000)) + } + }) + .collect(); + Some(parts.join(",")) + }) + .collect(); + Arc::new(strings) +} + +/// Creates short strings (length = `num_chars`) for the NULL-delimiter +/// (split-into-characters) benchmark. 
+fn create_short_strings(num_chars: usize) -> ArrayRef { + let mut rng = StdRng::seed_from_u64(SEED); + let strings: StringArray = (0..NUM_ROWS) + .map(|_| { + let s: String = (0..num_chars) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); + Some(s) + }) + .collect(); + Arc::new(strings) +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions-nested/src/string.rs b/datafusion/functions-nested/src/string.rs index aa2ae69bea4e5..b76736672cffa 100644 --- a/datafusion/functions-nested/src/string.rs +++ b/datafusion/functions-nested/src/string.rs @@ -26,13 +26,13 @@ use arrow::array::{ use arrow::datatypes::{DataType, Field}; use datafusion_common::utils::ListCoercion; -use datafusion_common::{DataFusionError, Result, not_impl_err}; +use datafusion_common::{DataFusionError, Result, ScalarValue, not_impl_err}; use std::fmt::{self, Write}; use crate::utils::make_scalar_function; use arrow::array::{ - GenericStringArray, StringArrayType, StringViewArray, + StringArrayType, StringViewArray, builder::{ArrayBuilder, LargeStringBuilder, StringViewBuilder}, cast::AsArray, }; @@ -43,8 +43,8 @@ use arrow::datatypes::DataType::{ use datafusion_common::cast::{ as_fixed_size_list_array, as_large_list_array, as_list_array, }; -use datafusion_common::exec_err; use datafusion_common::types::logical_string; +use datafusion_common::{exec_datafusion_err, exec_err}; use datafusion_expr::{ ArrayFunctionArgument, ArrayFunctionSignature, Coercion, ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature, @@ -189,11 +189,17 @@ make_udf_expr_and_func!( ) )] #[derive(Debug, PartialEq, Eq, Hash)] -pub(super) struct StringToArray { +pub struct StringToArray { signature: Signature, aliases: Vec, } +impl Default for StringToArray { + fn default() -> Self { + Self::new() + } +} + impl StringToArray { pub fn new() -> Self { Self { @@ -233,13 +239,71 @@ impl ScalarUDFImpl for StringToArray { } fn 
invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { - let args = &args.args; - match args[0].data_type() { - Utf8 | Utf8View => make_scalar_function(string_to_array_inner::)(args), - LargeUtf8 => make_scalar_function(string_to_array_inner::)(args), + let ScalarFunctionArgs { args, .. } = args; + + let delimiter_is_scalar = matches!(&args[1], ColumnarValue::Scalar(_)); + let null_str_is_scalar = args + .get(2) + .is_none_or(|a| matches!(a, ColumnarValue::Scalar(_))); + + if !(delimiter_is_scalar && null_str_is_scalar) { + return make_scalar_function(string_to_array_fallback)(&args); + } + + // Delimiter and null_str (if given) are scalar, so use the fast path + let delimiter = match &args[1] { + ColumnarValue::Scalar(s) => s.try_as_str().ok_or_else(|| { + exec_datafusion_err!( + "unsupported type for string_to_array delimiter: {:?}", + args[1].data_type() + ) + })?, + _ => unreachable!("delimiter must be scalar in this branch"), + }; + let null_value = match args.get(2) { + Some(ColumnarValue::Scalar(s)) => s.try_as_str().ok_or_else(|| { + exec_datafusion_err!( + "unsupported type for string_to_array null_str: {:?}", + args[2].data_type() + ) + })?, + _ => None, + }; + + let (all_scalar, string_array) = match &args[0] { + ColumnarValue::Array(a) => (false, Arc::clone(a)), + ColumnarValue::Scalar(s) => (true, s.to_array_of_size(1)?), + }; + + let result = match string_array.data_type() { + Utf8 => { + let arr = string_array.as_string::(); + let builder = + StringBuilder::with_capacity(arr.len(), arr.get_buffer_memory_size()); + string_to_array_scalar_args(&arr, delimiter, null_value, builder) + } + Utf8View => { + let arr = string_array.as_string_view(); + let builder = StringViewBuilder::with_capacity(arr.len()); + string_to_array_scalar_args(&arr, delimiter, null_value, builder) + } + LargeUtf8 => { + let arr = string_array.as_string::(); + let builder = LargeStringBuilder::with_capacity( + arr.len(), + arr.get_buffer_memory_size(), + ); + 
string_to_array_scalar_args(&arr, delimiter, null_value, builder) + } other => { exec_err!("unsupported type for string_to_array function as {other:?}") } + }?; + + if all_scalar { + ScalarValue::try_from_array(&result, 0).map(ColumnarValue::Scalar) + } else { + Ok(ColumnarValue::Array(result)) } } @@ -252,6 +316,201 @@ impl ScalarUDFImpl for StringToArray { } } +/// Appends `value` to the string builder, or NULL if it matches `null_value`. +#[inline(always)] +fn append_part( + builder: &mut impl StringArrayBuilderType, + value: &str, + null_value: Option<&str>, +) { + if null_value == Some(value) { + builder.append_null(); + } else { + builder.append_value(value); + } +} + +/// Optimized `string_to_array` implementation for the common case where +/// delimiter and null_value are scalar values. +fn string_to_array_scalar_args<'a, StringArrType, StringBuilderType>( + string_array: &StringArrType, + delimiter: Option<&str>, + null_value: Option<&str>, + string_builder: StringBuilderType, +) -> Result +where + StringArrType: StringArrayType<'a>, + StringBuilderType: StringArrayBuilderType, +{ + let mut list_builder = ListBuilder::new(string_builder); + + match delimiter { + Some("") => { + // Empty delimiter: each non-empty string becomes a single-element list. + // Empty strings produce an empty array (PostgreSQL compat). + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + let string = string_array.value(i); + if !string.is_empty() { + append_part(list_builder.values(), string, null_value); + } + list_builder.append(true); + } + } + Some(delimiter) => { + // Rather than using `str::split`, do the split ourselves using + // `memmem::Finder`. This allows pre-compiling the delimiter search + // pattern once and reusing it for all rows. 
+ let finder = memchr::memmem::Finder::new(delimiter.as_bytes()); + let delim_len = delimiter.len(); + + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + let string = string_array.value(i); + if !string.is_empty() { + let bytes = string.as_bytes(); + let mut start = 0; + for pos in finder.find_iter(bytes) { + append_part( + list_builder.values(), + &string[start..pos], + null_value, + ); + start = pos + delim_len; + } + // Trailing part after last delimiter (or entire string if no + // delimiter was found). + append_part(list_builder.values(), &string[start..], null_value); + } + list_builder.append(true); + } + } + None => { + // NULL delimiter: split into individual characters. + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + let string = string_array.value(i); + for (pos, c) in string.char_indices() { + append_part( + list_builder.values(), + &string[pos..pos + c.len_utf8()], + null_value, + ); + } + list_builder.append(true); + } + } + } + + Ok(Arc::new(list_builder.finish()) as ArrayRef) +} + +/// Fallback path for `string_to_array` when delimiter and/or null_value +/// are array columns rather than scalars. 
+fn string_to_array_fallback(args: &[ArrayRef]) -> Result { + let null_value_array = args.get(2); + + match args[0].data_type() { + Utf8 => { + let arr = args[0].as_string::(); + let builder = + StringBuilder::with_capacity(arr.len(), arr.get_buffer_memory_size()); + string_to_array_column_args(&arr, &args[1], null_value_array, builder) + } + Utf8View => { + let arr = args[0].as_string_view(); + let builder = StringViewBuilder::with_capacity(arr.len()); + string_to_array_column_args(&arr, &args[1], null_value_array, builder) + } + LargeUtf8 => { + let arr = args[0].as_string::(); + let builder = LargeStringBuilder::with_capacity( + arr.len(), + arr.get_buffer_memory_size(), + ); + string_to_array_column_args(&arr, &args[1], null_value_array, builder) + } + other => exec_err!("unsupported type for string_to_array function as {other:?}"), + } +} + +fn string_to_array_column_args<'a, StringArrType, StringBuilderType>( + string_array: &StringArrType, + delimiter_array: &ArrayRef, + null_value_array: Option<&ArrayRef>, + string_builder: StringBuilderType, +) -> Result +where + StringArrType: StringArrayType<'a>, + StringBuilderType: StringArrayBuilderType, +{ + let mut list_builder = ListBuilder::new(string_builder); + + for i in 0..string_array.len() { + if string_array.is_null(i) { + list_builder.append(false); + continue; + } + + let string = string_array.value(i); + let delimiter = get_str_value(delimiter_array, i); + let null_value = null_value_array.and_then(|arr| get_str_value(arr, i)); + + match delimiter { + Some("") => { + if !string.is_empty() { + append_part(list_builder.values(), string, null_value); + } + } + Some(delimiter) => { + if !string.is_empty() { + for part in string.split(delimiter) { + append_part(list_builder.values(), part, null_value); + } + } + } + None => { + for (pos, c) in string.char_indices() { + append_part( + list_builder.values(), + &string[pos..pos + c.len_utf8()], + null_value, + ); + } + } + } + + list_builder.append(true); + } + 
+ Ok(Arc::new(list_builder.finish()) as ArrayRef) +} + +/// Returns the string value at index `i` from a string array of any type. +fn get_str_value(array: &ArrayRef, i: usize) -> Option<&str> { + if array.is_null(i) { + return None; + } + match array.data_type() { + Utf8 => Some(array.as_string::().value(i)), + LargeUtf8 => Some(array.as_string::().value(i)), + Utf8View => Some(array.as_string_view().value(i)), + other => { + debug_assert!(false, "unexpected type in get_str_value: {other:?}"); + None + } + } +} + fn array_to_string_inner(args: &[ArrayRef]) -> Result { if args.len() < 2 || args.len() > 3 { return exec_err!("array_to_string expects two or three arguments"); @@ -521,275 +780,6 @@ where Ok(()) } -/// String_to_array SQL function -/// Splits string at occurrences of delimiter and returns an array of parts -/// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]' -fn string_to_array_inner(args: &[ArrayRef]) -> Result { - if args.len() < 2 || args.len() > 3 { - return exec_err!("string_to_array expects two or three arguments"); - } - - match args[0].data_type() { - Utf8 => { - let string_array = args[0].as_string::(); - let builder = StringBuilder::with_capacity( - string_array.len(), - string_array.get_buffer_memory_size(), - ); - string_to_array_inner_2::<&GenericStringArray, StringBuilder>( - args, - &string_array, - builder, - ) - } - Utf8View => { - let string_array = args[0].as_string_view(); - let builder = StringViewBuilder::with_capacity(string_array.len()); - string_to_array_inner_2::<&StringViewArray, StringViewBuilder>( - args, - &string_array, - builder, - ) - } - LargeUtf8 => { - let string_array = args[0].as_string::(); - let builder = LargeStringBuilder::with_capacity( - string_array.len(), - string_array.get_buffer_memory_size(), - ); - string_to_array_inner_2::<&GenericStringArray, LargeStringBuilder>( - args, - &string_array, - builder, - ) - } - other => exec_err!( - "unsupported type for first argument to 
string_to_array function as {other:?}" - ), - } -} - -fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>( - args: &'a [ArrayRef], - string_array: &StringArrType, - string_builder: StringBuilderType, -) -> Result -where - StringArrType: StringArrayType<'a>, - StringBuilderType: StringArrayBuilderType, -{ - match args[1].data_type() { - Utf8 => { - let delimiter_array = args[1].as_string::(); - if args.len() == 2 { - string_to_array_impl::< - StringArrType, - &GenericStringArray, - &StringViewArray, - StringBuilderType, - >(string_array, &delimiter_array, None, string_builder) - } else { - string_to_array_inner_3::< - StringArrType, - &GenericStringArray, - StringBuilderType, - >(args, string_array, &delimiter_array, string_builder) - } - } - Utf8View => { - let delimiter_array = args[1].as_string_view(); - - if args.len() == 2 { - string_to_array_impl::< - StringArrType, - &StringViewArray, - &StringViewArray, - StringBuilderType, - >(string_array, &delimiter_array, None, string_builder) - } else { - string_to_array_inner_3::< - StringArrType, - &StringViewArray, - StringBuilderType, - >(args, string_array, &delimiter_array, string_builder) - } - } - LargeUtf8 => { - let delimiter_array = args[1].as_string::(); - if args.len() == 2 { - string_to_array_impl::< - StringArrType, - &GenericStringArray, - &StringViewArray, - StringBuilderType, - >(string_array, &delimiter_array, None, string_builder) - } else { - string_to_array_inner_3::< - StringArrType, - &GenericStringArray, - StringBuilderType, - >(args, string_array, &delimiter_array, string_builder) - } - } - other => exec_err!( - "unsupported type for second argument to string_to_array function as {other:?}" - ), - } -} - -fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>( - args: &'a [ArrayRef], - string_array: &StringArrType, - delimiter_array: &DelimiterArrType, - string_builder: StringBuilderType, -) -> Result -where - StringArrType: StringArrayType<'a>, - 
DelimiterArrType: StringArrayType<'a>, - StringBuilderType: StringArrayBuilderType, -{ - match args[2].data_type() { - Utf8 => { - let null_type_array = Some(args[2].as_string::()); - string_to_array_impl::< - StringArrType, - DelimiterArrType, - &GenericStringArray, - StringBuilderType, - >( - string_array, - delimiter_array, - null_type_array, - string_builder, - ) - } - Utf8View => { - let null_type_array = Some(args[2].as_string_view()); - string_to_array_impl::< - StringArrType, - DelimiterArrType, - &StringViewArray, - StringBuilderType, - >( - string_array, - delimiter_array, - null_type_array, - string_builder, - ) - } - LargeUtf8 => { - let null_type_array = Some(args[2].as_string::()); - string_to_array_impl::< - StringArrType, - DelimiterArrType, - &GenericStringArray, - StringBuilderType, - >( - string_array, - delimiter_array, - null_type_array, - string_builder, - ) - } - other => { - exec_err!("unsupported type for string_to_array function as {other:?}") - } - } -} - -fn string_to_array_impl< - 'a, - StringArrType, - DelimiterArrType, - NullValueArrType, - StringBuilderType, ->( - string_array: &StringArrType, - delimiter_array: &DelimiterArrType, - null_value_array: Option, - string_builder: StringBuilderType, -) -> Result -where - StringArrType: StringArrayType<'a>, - DelimiterArrType: StringArrayType<'a>, - NullValueArrType: StringArrayType<'a>, - StringBuilderType: StringArrayBuilderType, -{ - let mut list_builder = ListBuilder::new(string_builder); - - match null_value_array { - None => string_array.iter().zip(delimiter_array.iter()).for_each( - |(string, delimiter)| match (string, delimiter) { - (Some(string), Some("")) => { - if !string.is_empty() { - list_builder.values().append_value(string); - } - list_builder.append(true); - } - (Some(string), Some(delimiter)) => { - if !string.is_empty() { - string.split(delimiter).for_each(|s| { - list_builder.values().append_value(s); - }); - } - list_builder.append(true); - } - (Some(string), None) => 
{ - string.chars().map(|c| c.to_string()).for_each(|c| { - list_builder.values().append_value(c.as_str()); - }); - list_builder.append(true); - } - _ => list_builder.append(false), - }, - ), - Some(null_value_array) => string_array - .iter() - .zip(delimiter_array.iter()) - .zip(null_value_array.iter()) - .for_each(|((string, delimiter), null_value)| { - match (string, delimiter) { - (Some(string), Some("")) => { - if !string.is_empty() { - if Some(string) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(string); - } - } - list_builder.append(true); - } - (Some(string), Some(delimiter)) => { - if !string.is_empty() { - string.split(delimiter).for_each(|s| { - if Some(s) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(s); - } - }); - } - list_builder.append(true); - } - (Some(string), None) => { - string.chars().map(|c| c.to_string()).for_each(|c| { - if Some(c.as_str()) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(c.as_str()); - } - }); - list_builder.append(true); - } - _ => list_builder.append(false), // null value - } - }), - }; - - let list_array = list_builder.finish(); - Ok(Arc::new(list_array) as ArrayRef) -} - trait StringArrayBuilderType: ArrayBuilder { fn append_value(&mut self, val: &str); diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 1216e1e0238dc..25136ca777c74 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -9358,6 +9358,74 @@ select string_to_list(e, 'm') from values; [adipiscing] NULL +# string_to_array: single-char delimiter producing multiple elements +query ? +SELECT string_to_array('a,b,c', ',') +---- +[a, b, c] + +# string_to_array: delimiter not found in input +query ? 
+SELECT string_to_array('abc', ',') +---- +[abc] + +# string_to_array: empty string input +query ? +SELECT string_to_array('', ',') +---- +[] + +# string_to_array: null_str matching multiple elements +query ? +SELECT string_to_array('a,NULL,b,NULL,c', ',', 'NULL') +---- +[a, NULL, b, NULL, c] + +# string_to_array: null_str matching all elements +query ? +SELECT string_to_array('x,x,x', ',', 'x') +---- +[NULL, NULL, NULL] + +# string_to_array: null_str with empty-string delimiter +query ? +SELECT string_to_array('abc', '', 'abc') +---- +[NULL] + +# string_to_array: NULL string input +query ? +SELECT string_to_array(NULL, ',') +---- +NULL + +# string_to_array: columnar delimiter +query ?? +SELECT string_to_array('a,b,c', col1), string_to_array('a::b::c', col2) + FROM (VALUES (',', '::')) AS t(col1, col2) +---- +[a, b, c] [a, b, c] + +# string_to_array: columnar null_str +query ? +SELECT string_to_array('a,NULL,b', ',', col1) + FROM (VALUES ('NULL')) AS t(col1) +---- +[a, NULL, b] + +# string_to_array: adjacent delimiters produce empty strings +query ? +SELECT string_to_array('a,,b', ',') +---- +[a, , b] + +# string_to_array: delimiter at start and end +query ? +SELECT string_to_array(',a,b,', ',') +---- +[, a, b, ] + # array_resize scalar function #1 query ? select array_resize(make_array(1, 2, 3), 1); From a120e4d1bde4c549d8a9323b8f35ab5451f6b337 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 08:43:56 -0700 Subject: [PATCH 04/15] chore(deps): bump taiki-e/install-action from 2.69.7 to 2.70.3 (#21271) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.69.7 to 2.70.3.
Release notes

Sourced from taiki-e/install-action's releases.

2.70.3

  • Update wasm-bindgen@latest to 0.2.116.

  • Update cargo-insta@latest to 1.47.2.

  • Update tombi@latest to 0.9.12.

  • Update biome@latest to 2.4.10.

2.70.2

  • Update vacuum@latest to 0.25.3.

  • Update tombi@latest to 0.9.11.

2.70.1

  • Update cargo-insta@latest to 1.47.1.

  • Update cargo-binstall@latest to 1.17.9.

  • Update tombi@latest to 0.9.10.

2.70.0

  • Install uv, uvw (Windows-only), and uvx binaries when installing uv. Previously, only the uv binary was installed. (#1632)

2.69.14

  • Update just@latest to 1.48.1.

  • Update wasm-bindgen@latest to 0.2.115.

2.69.13

  • Update mise@latest to 2026.3.17.

  • Update cargo-insta@latest to 1.47.0.

2.69.12

  • Update uv@latest to 0.11.2.

2.69.11

  • Update dprint@latest to 0.53.1.

  • Update mise@latest to 2026.3.16.

2.69.10

  • Update biome@latest to 2.4.9.

  • Update mise@latest to 2026.3.15.

2.69.9

  • Update uv@latest to 0.11.1.

... (truncated)

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

[2.70.3] - 2026-03-31

  • Update wasm-bindgen@latest to 0.2.116.

  • Update cargo-insta@latest to 1.47.2.

  • Update tombi@latest to 0.9.12.

  • Update biome@latest to 2.4.10.

[2.70.2] - 2026-03-30

  • Update vacuum@latest to 0.25.3.

  • Update tombi@latest to 0.9.11.

[2.70.1] - 2026-03-29

  • Update cargo-insta@latest to 1.47.1.

  • Update cargo-binstall@latest to 1.17.9.

  • Update tombi@latest to 0.9.10.

[2.70.0] - 2026-03-28

  • Install uv, uvw (Windows-only), and uvx binaries when installing uv. Previously, only the uv binary was installed. (#1632)

[2.69.14] - 2026-03-28

  • Update just@latest to 1.48.1.

  • Update wasm-bindgen@latest to 0.2.115.

[2.69.13] - 2026-03-27

  • Update mise@latest to 2026.3.17.

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.69.7&new-version=2.70.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/audit.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index c4acb09fe8702..c880d1bae000e 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -42,7 +42,7 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Install cargo-audit - uses: taiki-e/install-action@0d865d5cc6d507df4765f1f866bfae8bab4e2a73 # v2.69.7 + uses: taiki-e/install-action@6ef672efc2b5aabc787a9e94baf4989aa02a97df # v2.70.3 with: tool: cargo-audit - name: Run audit check diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 24b988476fc14..0d12ddc375718 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -431,7 +431,7 @@ jobs: sudo apt-get update -qq sudo apt-get install -y -qq clang - name: Setup wasm-pack - uses: taiki-e/install-action@0d865d5cc6d507df4765f1f866bfae8bab4e2a73 # v2.69.7 + uses: taiki-e/install-action@6ef672efc2b5aabc787a9e94baf4989aa02a97df # v2.70.3 with: tool: wasm-pack - name: Run tests with headless mode @@ -771,7 +771,7 @@ jobs: - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - name: Install cargo-msrv - uses: taiki-e/install-action@0d865d5cc6d507df4765f1f866bfae8bab4e2a73 # v2.69.7 + uses: taiki-e/install-action@6ef672efc2b5aabc787a9e94baf4989aa02a97df # v2.70.3 with: tool: cargo-msrv From dcf818e4c57b39253c095b0a8098a74af6bcd635 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 08:46:14 -0700 Subject: [PATCH 05/15] chore(deps): bump rustyline from 17.0.2 to 18.0.0 (#21276) Bumps [rustyline](https://github.com/kkawakam/rustyline) from 17.0.2 to 18.0.0.
Release notes

Sourced from rustyline's releases.

18.0.0

What's Changed

  • Support minimal repaint #882
  • Fix edit_kill #887, #885
  • On Windows, check that the prompt is not styled #890, #889, #836, #562, #702, #215
  • Bump windows-sys version #892
  • Check NO_COLOR environment variable #894
  • Fix clippy warning #896
  • Update radix_trie requirement from 0.2 to 0.3 #898
  • Introduce Prompt trait for styling #893
  • Fix partial refresh #899, #897
  • Remove doc_auto_cfg #900
  • Fix vi paste #901, #732
  • Install signal handlers only when actually reading #903, #902
  • Configure cargo deny #904, #637
  • Update rusqlite version #906
  • Update signal-hook to 0.4 #907
  • Replace fd-lock with std File::lock #909, #908
  • Bump nix version to 0.31 #911
  • Refactor code related to signal handling #912
  • Bump signal-hook version #913, #910
  • Update rand to 0.10 #917
  • Update skim from 0.10 to 3.3.0 #919
  • Allow getting handler to fail after partial key event matches. #924, #923
  • Update dependencies #929
  • Clippy #930
  • Add comments on how to debug with PowerShell #933
  • Try to fix a panic related to SIG_PIPE #932, #931

Full Changelog: https://github.com/kkawakam/rustyline/compare/v17.0.2...v18.0.0

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=rustyline&package-manager=cargo&previous-version=17.0.2&new-version=18.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 44 +++++++++------------------------------ datafusion-cli/Cargo.toml | 2 +- 2 files changed, 11 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 802bfc4703010..6a4110ea33aa6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1746,7 +1746,7 @@ dependencies = [ "itertools 0.14.0", "liblzma", "log", - "nix 0.31.2", + "nix", "object_store", "parking_lot", "parquet", @@ -2085,7 +2085,7 @@ dependencies = [ "insta", "log", "mimalloc", - "nix 0.31.2", + "nix", "nom", "object_store", "prost", @@ -2808,9 +2808,9 @@ checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] name = "endian-type" -version = "0.1.2" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +checksum = "869b0adbda23651a9c5c0c3d270aac9fcb52e8622a8f2b17e57802d7791962f2" [[package]] name = "enum-ordinalize" @@ -2905,17 +2905,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "fd-lock" -version = "4.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" -dependencies = [ - "cfg-if", - "rustix", - "windows-sys 0.59.0", -] - [[package]] name = "ferroid" version = "0.8.9" @@ -4079,18 +4068,6 @@ dependencies = [ "smallvec", ] -[[package]] -name = "nix" -version = "0.30.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" -dependencies = [ - "bitflags", - "cfg-if", - "cfg_aliases", - "libc", -] - [[package]] name = "nix" version = "0.31.2" @@ -4900,9 +4877,9 @@ checksum = 
"f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "radix_trie" -version = "0.2.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +checksum = "3b4431027dcd37fc2a73ef740b5f233aa805897935b8bce0195e41bbf9a3289a" dependencies = [ "endian-type", "nibble_vec", @@ -5308,24 +5285,23 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "rustyline" -version = "17.0.2" +version = "18.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" +checksum = "4a990b25f351b25139ddc7f21ee3f6f56f86d6846b74ac8fad3a719a287cd4a0" dependencies = [ "bitflags", "cfg-if", "clipboard-win", - "fd-lock", "home", "libc", "log", "memchr", - "nix 0.30.1", + "nix", "radix_trie", "unicode-segmentation", "unicode-width 0.2.2", "utf8parse", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 3fe6be964c3f6..40e0e50dacd7a 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -65,7 +65,7 @@ object_store = { workspace = true, features = ["aws", "gcp", "http"] } parking_lot = { workspace = true } parquet = { workspace = true, default-features = false } regex = { workspace = true } -rustyline = "17.0" +rustyline = "18.0" tokio = { workspace = true, features = ["macros", "parking_lot", "rt", "rt-multi-thread", "signal", "sync"] } url = { workspace = true } From 26783124f1a79969f455f08fb3548ad58811ed20 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:46:35 +0000 Subject: [PATCH 06/15] chore(deps): bump ctor from 0.6.3 to 0.8.0 (#21282) Bumps [ctor](https://github.com/mmastrac/rust-ctor) from 0.6.3 to 0.8.0.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=ctor&package-manager=cargo&previous-version=0.6.3&new-version=0.8.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6a4110ea33aa6..3a4449f4a50a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1627,9 +1627,9 @@ dependencies = [ [[package]] name = "ctor" -version = "0.6.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "424e0138278faeb2b401f174ad17e715c829512d74f3d1e81eb43365c2e0590e" +checksum = "352d39c2f7bef1d6ad73db6f5160efcaed66d94ef8c6c573a8410c00bf909a98" dependencies = [ "ctor-proc-macro", "dtor", @@ -2757,9 +2757,9 @@ dependencies = [ [[package]] name = "dtor" -version = "0.1.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "404d02eeb088a82cfd873006cb713fe411306c7d182c344905e101fb1167d301" +checksum = "f1057d6c64987086ff8ed0fd3fbf377a6b7d205cc7715868cd401705f715cbe4" dependencies = [ "dtor-proc-macro", ] diff --git a/Cargo.toml b/Cargo.toml index 1bf039845fb7f..ffdc14cc514dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -114,7 +114,7 @@ bytes = "1.11" bzip2 = "0.6.1" chrono = { version = "0.4.44", default-features = false } criterion = "0.8" -ctor = "0.6.3" +ctor = "0.8.0" dashmap = "6.0.1" datafusion = { path = "datafusion/core", version = "53.0.0", default-features = false } datafusion-catalog = { path = "datafusion/catalog", version = "53.0.0" } From 7138a832e33200cd3ffc8a6fc3cc491d71971f1b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 15:47:02 +0000 Subject: [PATCH 07/15] chore(deps): bump snmalloc-rs from 0.3.8 to 0.7.4 (#21280) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [snmalloc-rs](https://github.com/microsoft/snmalloc) from 0.3.8 to 0.7.4.
Release notes

Sourced from snmalloc-rs's releases.

0.7.4

What's Changed

Optimisations

Bug Fixes

Platform support

Rust

C++ support

Build

CI

New Contributors

Full Changelog: https://github.com/microsoft/snmalloc/compare/0.7.3...0.7.4

0.7.3

What's Changed

This release primarily improves CMake support for downstream uses of snmalloc.

Build

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=snmalloc-rs&package-manager=cargo&previous-version=0.3.8&new-version=0.7.4)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- benchmarks/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3a4449f4a50a3..5380451bd0508 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5646,18 +5646,18 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "snmalloc-rs" -version = "0.3.8" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb317153089fdfa4d8a2eec059d40a5a23c3bde43995ea23b19121c3f621e74a" +checksum = "530a04ae687609072d0edd38866406fbbcd23d2f716791437e312ec4d64a355a" dependencies = [ "snmalloc-sys", ] [[package]] name = "snmalloc-sys" -version = "0.3.8" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "065fea53d32bb77bc36cca466cb191f2e5216ebfd0ed360b1d64889ee6e559ea" +checksum = "a96cbeb16d6bcc5979f80ec907582a886b7fb3b9a707678b63dd93a10d8ee858" dependencies = [ "cmake", ] diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 56f7704309780..f82f1c0a03e3d 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -56,7 +56,7 @@ rand = { workspace = true } regex.workspace = true serde = { version = "1.0.228", features = ["derive"] } serde_json = { workspace = true } -snmalloc-rs = { version = "0.3", optional = true } +snmalloc-rs = { version = "0.7", optional = true } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } tokio-util = { version = "0.7.17" } From 4460ae0d2615f42787506a2efd59c9187ba6417d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 08:47:36 -0700 Subject: [PATCH 08/15] chore(deps): bump sha1 from 0.10.6 to 0.11.0 (#21277) Bumps [sha1](https://github.com/RustCrypto/hashes) from 0.10.6 to 0.11.0.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=sha1&package-manager=cargo&previous-version=0.10.6&new-version=0.11.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 88 ++++++++++++++++++++++++++++++++----- datafusion/spark/Cargo.toml | 2 +- 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5380451bd0508..8d87c5bc5b541 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -616,7 +616,7 @@ dependencies = [ "fastrand", "hex", "http 1.4.0", - "sha1", + "sha1 0.10.6", "time", "tokio", "tracing", @@ -1040,7 +1040,7 @@ version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" dependencies = [ - "digest", + "digest 0.10.7", ] [[package]] @@ -1054,7 +1054,7 @@ dependencies = [ "cc", "cfg-if", "constant_time_eq", - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -1066,6 +1066,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdd35008169921d80bc60d3d0ab416eecb028c4cd653352907921d95084790be" +dependencies = [ + "hybrid-array", +] + [[package]] name = "bollard" version = "0.20.2" @@ -1418,6 +1427,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + [[package]] name = "const-random" version = "0.1.18" @@ -1493,6 +1508,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc" version = "3.4.0" @@ -1604,6 +1628,15 @@ dependencies = [ "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "77727bb15fa921304124b128af125e7e3b968275d1b108b379190264f4423710" +dependencies = [ + "hybrid-array", +] + [[package]] name = "csv" version = "1.4.0" @@ -2574,7 +2607,7 @@ dependencies = [ "percent-encoding", "rand 0.9.2", "serde_json", - "sha1", + "sha1 0.11.0", "sha2", "url", ] @@ -2701,11 +2734,22 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "crypto-common", + "block-buffer 0.10.4", + "crypto-common 0.1.7", "subtle", ] +[[package]] +name = "digest" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" +dependencies = [ + "block-buffer 0.12.0", + "const-oid", + "crypto-common 0.2.1", +] + [[package]] name = "dirs" version = "6.0.0" @@ -3303,7 +3347,7 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest", + "digest 0.10.7", ] [[package]] @@ -3388,6 +3432,15 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" +[[package]] +name = "hybrid-array" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a79f2aff40c18ab8615ddc5caa9eb5b96314aef18fe5823090f204ad988e813" +dependencies = [ + "typenum", +] + [[package]] name = "hyper" version = "1.8.1" @@ -3998,7 +4051,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ "cfg-if", - "digest", + "digest 0.10.7", ] [[package]] @@ -5562,8 +5615,19 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha1" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aacc4cc499359472b4abe1bf11d0b12e688af9a805fa5e3016f9a386dc2d0214" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.2", ] [[package]] @@ -5573,8 +5637,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", ] [[package]] diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index 162b6d814e804..bd5f2bb18aaec 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -60,7 +60,7 @@ log = { workspace = true } percent-encoding = "2.3.2" rand = { workspace = true } serde_json = { workspace = true } -sha1 = "0.10" +sha1 = "0.11" sha2 = { workspace = true } url = { workspace = true } From 55d1995624d2d9c212b103217a0a3b692416a534 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:06:38 -0700 Subject: [PATCH 09/15] chore(deps): bump astral-sh/setup-uv from 7.6.0 to 8.0.0 (#21272) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 7.6.0 to 8.0.0.
Release notes

Sourced from astral-sh/setup-uv's releases.

v8.0.0 🌈 Immutable releases and secure tags

This is the first immutable release of setup-uv 🥳

All future releases are also immutable, if you want to know more about what this means checkout the docs.

This release also has two breaking changes

New format for manifest-file

The previously deprecated way of defining a custom version manifest to control which uv versions are available and where to download them from got removed. The functionality is still there but you have to use the new format.

No more major and minor tags

To increase security even more we will stop publishing minor tags. You won't be able to use @v8 or @v8.0 any longer. We do this because pinning to major releases opens up users to supply chain attacks like what happened to tj-actions.

[!TIP] Use the immutable tag as a version astral-sh/setup-uv@v8.0.0 Or even better the githash astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57

🚨 Breaking changes

🧰 Maintenance

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=astral-sh/setup-uv&package-manager=github_actions&previous-version=7.6.0&new-version=8.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/docs.yaml | 2 +- .github/workflows/docs_pr.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 63add4dacc812..89bd77670c12d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -41,7 +41,7 @@ jobs: path: asf-site - name: Setup uv - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - name: Install dependencies run: uv sync --package datafusion-docs diff --git a/.github/workflows/docs_pr.yaml b/.github/workflows/docs_pr.yaml index cc5b9a1e44bb5..5abf9a119d2f5 100644 --- a/.github/workflows/docs_pr.yaml +++ b/.github/workflows/docs_pr.yaml @@ -45,7 +45,7 @@ jobs: submodules: true fetch-depth: 1 - name: Setup uv - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 - name: Install doc dependencies run: uv sync --package datafusion-docs - name: Install dependency graph tooling From 3d177fcc67aa1acd51b26c549c3370cb66ae35a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:07:15 -0700 Subject: [PATCH 10/15] chore(deps): bump github/codeql-action from 4.34.1 to 4.35.1 (#21273) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.34.1 to 4.35.1.
Release notes

Sourced from github/codeql-action's releases.

v4.35.1

v4.35.0

Changelog

Sourced from github/codeql-action's changelog.

CodeQL Action Changelog

See the releases page for the relevant changes to the CodeQL CLI and language packs.

[UNRELEASED]

No user facing changes.

4.35.1 - 27 Mar 2026

4.35.0 - 27 Mar 2026

4.34.1 - 20 Mar 2026

  • Downgrade default CodeQL bundle version to 2.24.3 due to issues with a small percentage of Actions and JavaScript analyses. #3762

4.34.0 - 20 Mar 2026

  • Added an experimental change which disables TRAP caching when improved incremental analysis is enabled, since improved incremental analysis supersedes TRAP caching. This will improve performance and reduce Actions cache usage. We expect to roll this change out to everyone in March. #3569
  • We are rolling out improved incremental analysis to C/C++ analyses that use build mode none. We expect this rollout to be complete by the end of April 2026. #3584
  • Update default CodeQL bundle version to 2.25.0. #3585

4.33.0 - 16 Mar 2026

  • Upcoming change: Starting April 2026, the CodeQL Action will skip collecting file coverage information on pull requests to improve analysis performance. File coverage information will still be computed on non-PR analyses. Pull request analyses will log a warning about this upcoming change. #3562

    To opt out of this change:

    • Repositories owned by an organization: Create a custom repository property with the name github-codeql-file-coverage-on-prs and the type "True/false", then set this property to true in the repository's settings. For more information, see Managing custom properties for repositories in your organization. Alternatively, if you are using an advanced setup workflow, you can set the CODEQL_ACTION_FILE_COVERAGE_ON_PRS environment variable to true in your workflow.
    • User-owned repositories using default setup: Switch to an advanced setup workflow and set the CODEQL_ACTION_FILE_COVERAGE_ON_PRS environment variable to true in your workflow.
    • User-owned repositories using advanced setup: Set the CODEQL_ACTION_FILE_COVERAGE_ON_PRS environment variable to true in your workflow.
  • Fixed a bug which caused the CodeQL Action to fail loading repository properties if a "Multi select" repository property was configured for the repository. #3557

  • The CodeQL Action now loads custom repository properties on GitHub Enterprise Server, enabling the customization of features such as github-codeql-disable-overlay that was previously only available on GitHub.com. #3559

  • Once private package registries can be configured with OIDC-based authentication for organizations, the CodeQL Action will now be able to accept such configurations. #3563

  • Fixed the retry mechanism for database uploads. Previously this would fail with the error "Response body object should not be disturbed or locked". #3564

  • A warning is now emitted if the CodeQL Action detects a repository property whose name suggests that it relates to the CodeQL Action, but which is not one of the properties recognised by the current version of the CodeQL Action. #3570

4.32.6 - 05 Mar 2026

  • Update default CodeQL bundle version to 2.24.3. #3548

4.32.5 - 02 Mar 2026

  • Repositories owned by an organization can now set up the github-codeql-disable-overlay custom repository property to disable improved incremental analysis for CodeQL. First, create a custom repository property with the name github-codeql-disable-overlay and the type "True/false" in the organization's settings. Then in the repository's settings, set this property to true to disable improved incremental analysis. For more information, see Managing custom properties for repositories in your organization. This feature is not yet available on GitHub Enterprise Server. #3507
  • Added an experimental change so that when improved incremental analysis fails on a runner — potentially due to insufficient disk space — the failure is recorded in the Actions cache so that subsequent runs will automatically skip improved incremental analysis until something changes (e.g. a larger runner is provisioned or a new CodeQL version is released). We expect to roll this change out to everyone in March. #3487
  • The minimum memory check for improved incremental analysis is now skipped for CodeQL 2.24.3 and later, which has reduced peak RAM usage. #3515

... (truncated)

Commits
  • c10b806 Merge pull request #3782 from github/update-v4.35.1-d6d1743b8
  • c5ffd06 Update changelog for v4.35.1
  • d6d1743 Merge pull request #3781 from github/henrymercer/update-git-minimum-version
  • 65d2efa Add changelog note
  • 2437b20 Update minimum git version for overlay to 2.36.0
  • ea5f719 Merge pull request #3775 from github/dependabot/npm_and_yarn/node-forge-1.4.0
  • 45ceeea Merge pull request #3777 from github/mergeback/v4.35.0-to-main-b8bb9f28
  • 24448c9 Rebuild
  • 7c51060 Update changelog and version after v4.35.0
  • b8bb9f2 Merge pull request #3776 from github/update-v4.35.0-0078ad667
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github/codeql-action&package-manager=github_actions&previous-version=4.34.1&new-version=4.35.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index be04992378b3d..920e1e79c8540 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -45,11 +45,11 @@ jobs: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@38697555549f1db7851b81482ff19f1fa5c4fedc # v4 + uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@38697555549f1db7851b81482ff19f1fa5c4fedc # v4 + uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4 with: category: "/language:actions" From 19eb849b21b3a2d45f1451f57c922aaa482ec321 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:07:52 -0700 Subject: [PATCH 11/15] chore(deps): bump pygments from 2.19.2 to 2.20.0 (#21256) Bumps [pygments](https://github.com/pygments/pygments) from 2.19.2 to 2.20.0.
Release notes

Sourced from pygments's releases.

2.20.0

  • New lexers:

  • Updated lexers:

    • archetype: Fix catastrophic backtracking in GUID and ID patterns (#3064)
    • ASN.1: Recognize minus sign and fix range operator (#3014, #3060)
    • C++: Add C++26 keywords (#2955), add integer literal suffixes (#2966)
    • ComponentPascal: Fix analyse_text (#3028, #3032)
    • Coq renamed to Rocq (#2883, #2908)
    • Cython: Various improvements (#2932, #2933)
    • Debian control: Improve architecture parsing (#3052)
    • Devicetree: Add support for overlay/fragments (#3021), add bytestring support (#3022), fix catastrophic backtracking (#3057)
    • Fennel: Various improvements (#2911)
    • Haskell: Handle escape sequences in character literals (#3069, #1795)
    • Java: Add module keywords (#2955)
    • Lean4: Add operators ]', ]?, ]! (#2946)
    • LESS: Support single-line comments (#3005)
    • LilyPond: Update to 2.25.29 (#2974)
    • LLVM: Support C-style comments (#3023, #2978)
    • Lua(u): Fix catastrophic backtracking (#3047)
    • Macaulay2: Update to 1.25.05 (#2893), 1.25.11 (#2988)
    • Mathematica: Various improvements (#2957)
    • meson: Add additional operators (#2919)
    • MySQL: Update keywords (#2970)
    • org-Mode: Support both schedule and deadline (#2899)
    • PHP: Add __PROPERTY__ magic constant (#2924), add reserved keywords (#3002)
    • PostgreSQL: Add more keywords (#2985)
    • protobuf: Fix namespace tokenization (#2929)
    • Python: Add t-string support (#2973, #3009, #3010)
    • Tablegen: Fix infinite loop (#2972, #2940)
    • Tera Term macro: Add commands introduced in v5.3 through v5.6 (#2951)
    • TOML: Support TOML 1.1.0 (#3026, #3027)
    • Turtle: Allow empty comment lines (#2980)
    • XML: Added .xbrl as file ending (#2890, #2891)
  • Drop Python 3.8, and add Python 3.14 as a supported version (#2987, #3012)

  • Various improvements to autopygmentize (#2894)

  • Update onedark style to support more token types (#2977)

  • Update rtt style to support more token types (#2895)

  • Cache entry points to improve performance (#2979)

  • Fix xterm-256 color table (#3043)

  • Fix kwargs dictionary getting mutated on each call (#3044)

Changelog

Sourced from pygments's changelog.

Version 2.20.0

(released March 29th, 2026)

  • New lexers:

  • Updated lexers:

    • archetype: Fix catastrophic backtracking in GUID and ID patterns (#3064)
    • ASN.1: Recognize minus sign and fix range operator (#3014, #3060)
    • C++: Add C++26 keywords (#2955), add integer literal suffixes (#2966)
    • ComponentPascal: Fix analyse_text (#3028, #3032)
    • Coq renamed to Rocq (#2883, #2908)
    • Cython: Various improvements (#2932, #2933)
    • Debian control: Improve architecture parsing (#3052)
    • Devicetree: Add support for overlay/fragments (#3021), add bytestring support (#3022), fix catastrophic backtracking (#3057)
    • Fennel: Various improvements (#2911)
    • Haskell: Handle escape sequences in character literals (#3069, #1795)
    • Java: Add module keywords (#2955)
    • Lean4: Add operators ]', ]?, ]! (#2946)
    • LESS: Support single-line comments (#3005)
    • LilyPond: Update to 2.25.29 (#2974)
    • LLVM: Support C-style comments (#3023, #2978)
    • Lua(u): Fix catastrophic backtracking (#3047)
    • Macaulay2: Update to 1.25.05 (#2893), 1.25.11 (#2988)
    • Mathematica: Various improvements (#2957)
    • meson: Add additional operators (#2919)
    • MySQL: Update keywords (#2970)
    • org-Mode: Support both schedule and deadline (#2899)
    • PHP: Add __PROPERTY__ magic constant (#2924), add reserved keywords (#3002)
    • PostgreSQL: Add more keywords (#2985)
    • protobuf: Fix namespace tokenization (#2929)
    • Python: Add t-string support (#2973, #3009, #3010)
    • Tablegen: Fix infinite loop (#2972, #2940)
    • Tera Term macro: Add commands introduced in v5.3 through v5.6 (#2951)
    • TOML: Support TOML 1.1.0 (#3026, #3027)
    • Turtle: Allow empty comment lines (#2980)
    • XML: Added .xbrl as file ending (#2890, #2891)
  • Drop Python 3.8, and add Python 3.14 as a supported version (#2987, #3012)

  • Various improvements to autopygmentize (#2894)

  • Update onedark style to support more token types (#2977)

  • Update rtt style to support more token types (#2895)

  • Cache entry points to improve performance (#2979)

  • Fix xterm-256 color table (#3043)

  • Fix kwargs dictionary getting mutated on each call (#3044)

Commits
  • 708197d Fix underline length.
  • 1d4538a Prepare 2.20 release.
  • 2ceaee4 Update CHANGES.
  • e3a3c54 Fix Haskell lexer: handle escape sequences in character literals (#3069)
  • d7c3453 Merge pull request #3071 from pygments/harden-html-formatter
  • 0f97e7c Harden the HTML formatter against CSS.
  • 9f981b2 Update CHANGES.
  • 1d88915 Update CHANGES.
  • c3d93ad Fix ASN.1 lexer: recognize minus sign and fix range operator (#3060)
  • 4f06bcf fix bad behaving backtracking regex in CommonLispLexer
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=pygments&package-manager=uv&previous-version=2.19.2&new-version=2.20.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apache/datafusion/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 541fe15f43383..925d850bba42d 100644 --- a/uv.lock +++ b/uv.lock @@ -793,11 +793,11 @@ wheels = [ [[package]] name = "pygments" -version = "2.19.2" +version = "2.20.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, ] [[package]] From a9dc1dcac15ebb748fc865e4d73af97b1e61c3be Mon Sep 17 00:00:00 2001 From: Huaijin Date: Wed, 1 Apr 2026 00:10:27 +0800 Subject: [PATCH 12/15] chore: fix upgrade guide link for object_store release notes (#21283) fix object_store release notes link in datafusion 53 upgrade guide find this when i try upgrade datafusion to 53 --- 
docs/source/library-user-guide/upgrading/53.0.0.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md index ef5f5743f5ea6..f616220778936 100644 --- a/docs/source/library-user-guide/upgrading/53.0.0.md +++ b/docs/source/library-user-guide/upgrading/53.0.0.md @@ -37,7 +37,7 @@ these crates. See the [Arrow 58.0.0 release notes] and the [object_store 0.13.0 upgrade guide] for details on breaking changes in those versions. [arrow 58.0.0 release notes]: https://github.com/apache/arrow-rs/releases/tag/58.0.0 -[object_store 0.13.0 upgrade guide]: https://github.com/apache/arrow-rs/releases/tag/58.0.0 +[object_store 0.13.0 upgrade guide]: https://github.com/apache/arrow-rs-object-store/blob/v0.13.0/CHANGELOG.md ### `ExecutionPlan::properties` now returns `&Arc` From c473c1852fad41541a2adfb252535321fd26a1b7 Mon Sep 17 00:00:00 2001 From: Bert Vermeiren <103956021+bert-beyondloops@users.noreply.github.com> Date: Tue, 31 Mar 2026 20:18:37 +0200 Subject: [PATCH 13/15] =?UTF-8?q?feat(memory=5Fpool):=20add=20`TrackConsum?= =?UTF-8?q?ersPool::metrics()`=20to=20expose=20cons=E2=80=A6=20(#21147)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #21146 ## Rationale for this change There is currently no way to programmatically inspect the memory consumption of individual consumers tracked by TrackConsumersPool. The only available method, report_top(), returns a formatted string intended for human-readable output, making it unsuitable for programmatic use (e.g., metrics collection, monitoring, or custom reporting). ## What changes are included in this PR? Added a metrics() method to TrackConsumersPool that returns a Vec — a snapshot of all currently tracked consumers. 
Each MemoryConsumerMetrics entry exposes: name — the consumer's name can_spill — whether the consumer supports spilling to disk reserved — current bytes reserved peak — peak bytes reserved This allows callers to inspect memory usage programmatically without parsing formatted strings. ## Are these changes tested? Yes. A dedicated unit test test_track_consumers_pool_metrics was added in pool.rs that verifies: - An empty pool returns no metrics - name, can_spill, reserved, and peak are correctly reported for each consumer - Peak is tracked independently from current reservation (grow then shrink scenario) - Dropped consumers are removed from metrics ## Are there any user-facing changes? No Co-authored-by: Bert Vermeiren Co-authored-by: Andrew Lamb --- datafusion/execution/src/memory_pool/pool.rs | 83 ++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs index b10270851cc06..19aaa0371ada3 100644 --- a/datafusion/execution/src/memory_pool/pool.rs +++ b/datafusion/execution/src/memory_pool/pool.rs @@ -302,6 +302,32 @@ impl TrackedConsumer { } } +/// A point-in-time snapshot of a tracked memory consumer's state. +/// +/// Returned by [`TrackConsumersPool::metrics()`]. +#[derive(Debug, Clone)] +pub struct MemoryConsumerMetrics { + /// The name of the memory consumer + pub name: String, + /// Whether this consumer can spill to disk + pub can_spill: bool, + /// The number of bytes currently reserved by this consumer + pub reserved: usize, + /// The peak number of bytes reserved by this consumer + pub peak: usize, +} + +impl From<&TrackedConsumer> for MemoryConsumerMetrics { + fn from(tracked: &TrackedConsumer) -> Self { + Self { + name: tracked.name.clone(), + can_spill: tracked.can_spill, + reserved: tracked.reserved(), + peak: tracked.peak(), + } + } +} + /// A [`MemoryPool`] that tracks the consumers that have /// reserved memory within the inner memory pool. 
/// @@ -381,6 +407,15 @@ impl<I: MemoryPool> TrackConsumersPool<I> { } } + /// Returns a snapshot of all currently tracked consumers. + pub fn metrics(&self) -> Vec<MemoryConsumerMetrics> { + self.tracked_consumers + .lock() + .values() + .map(Into::into) + .collect() + } + /// Returns a formatted string with the top memory consumers. pub fn report_top(&self, top: usize) -> String { let mut consumers = self @@ -778,6 +813,54 @@ mod tests { test_per_pool_type(tracked_greedy_pool); } + #[test] + fn test_track_consumers_pool_metrics() { + let track_consumers_pool = Arc::new(TrackConsumersPool::new( + GreedyMemoryPool::new(1000), + NonZeroUsize::new(3).unwrap(), + )); + let memory_pool: Arc<dyn MemoryPool> = Arc::clone(&track_consumers_pool) as _; + + // Empty pool has no metrics + assert!(track_consumers_pool.metrics().is_empty()); + + // Register consumers with different spill settings + let r1 = MemoryConsumer::new("spilling") + .with_can_spill(true) + .register(&memory_pool); + let r2 = MemoryConsumer::new("non-spilling").register(&memory_pool); + + // Grow r1 in two steps to verify peak tracking + r1.grow(100); + r1.grow(50); + r1.shrink(50); // reserved=100, peak=150 + + r2.grow(200); // reserved=200, peak=200 + + let mut metrics = track_consumers_pool.metrics(); + metrics.sort_by_key(|m| m.name.clone()); + + assert_eq!(metrics.len(), 2); + + let m_non = &metrics[0]; + assert_eq!(m_non.name, "non-spilling"); + assert!(!m_non.can_spill); + assert_eq!(m_non.reserved, 200); + assert_eq!(m_non.peak, 200); + + let m_spill = &metrics[1]; + assert_eq!(m_spill.name, "spilling"); + assert!(m_spill.can_spill); + assert_eq!(m_spill.reserved, 100); + assert_eq!(m_spill.peak, 150); + + // Unregistered consumers are removed from metrics + drop(r2); + let metrics = track_consumers_pool.metrics(); + assert_eq!(metrics.len(), 1); + assert_eq!(metrics[0].name, "spilling"); + } + #[test] fn test_tracked_consumers_pool_use_beyond_errors() { let setting = make_settings(); From e74b83ab026379158a8031c9d70f187fbb704172 Mon Sep 17 00:00:00
2001 From: Huaijin Date: Wed, 1 Apr 2026 02:27:21 +0800 Subject: [PATCH 14/15] fix: date overflow panic (#21233) ## Which issue does this PR close? - Closes #21234 ## Rationale for this change - see #21234 ## What changes are included in this PR? handle Date32 and Date64 in `get_extreme_value` function ## Are these changes tested? yes, add test case ## Are there any user-facing changes? --------- Co-authored-by: Andrew Lamb --- datafusion/expr-common/src/interval_arithmetic.rs | 2 ++ .../test_files/datetime/arith_date_interval.slt | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index 0f88723d116f5..883c721080611 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -49,6 +49,8 @@ macro_rules! get_extreme_value { DataType::Int64 => ScalarValue::Int64(Some(i64::$extreme)), DataType::Float32 => ScalarValue::Float32(Some(f32::$extreme)), DataType::Float64 => ScalarValue::Float64(Some(f64::$extreme)), + DataType::Date32 => ScalarValue::Date32(Some(i32::$extreme)), + DataType::Date64 => ScalarValue::Date64(Some(i64::$extreme)), DataType::Duration(TimeUnit::Second) => { ScalarValue::DurationSecond(Some(i64::$extreme)) } diff --git a/datafusion/sqllogictest/test_files/datetime/arith_date_interval.slt b/datafusion/sqllogictest/test_files/datetime/arith_date_interval.slt index ad2e7ed496f79..01e1939996dfc 100644 --- a/datafusion/sqllogictest/test_files/datetime/arith_date_interval.slt +++ b/datafusion/sqllogictest/test_files/datetime/arith_date_interval.slt @@ -35,3 +35,15 @@ query T SELECT arrow_typeof('2001-09-28'::date - interval '25 hour') ---- Date32 + +query error Arrow error: Compute error: Date arithmetic overflow +SELECT arrow_cast('2020-01-01', 'Date32') + INTERVAL '999999' YEAR + +query error Arrow error: Compute error: Date arithmetic overflow +SELECT arrow_cast('2020-01-01', 
'Date32') - INTERVAL '999999' YEAR + +query error Arrow error: Compute error: Date arithmetic overflow +SELECT arrow_cast('2020-01-01', 'Date64') + INTERVAL '999999' YEAR + +query error Arrow error: Compute error: Date arithmetic overflow +SELECT arrow_cast('2020-01-01', 'Date64') - INTERVAL '999999' YEAR From 9de1253f2a8435fe486a93686db77531b54e7a3e Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 31 Mar 2026 14:33:11 -0400 Subject: [PATCH 15/15] Update repeat UDF to emit utf8view when input is utf8view (#20645) ## Which issue does this PR close? Part of https://github.com/apache/datafusion/issues/20585 ## Rationale for this change Functions ideally should emit strings in the same format as the input and previously the repeat function was emitting using utf8 for input that was in utf8view. ## What changes are included in this PR? Code, tests ## Are these changes tested? Yes ## Are there any user-facing changes? --- datafusion/functions/src/string/repeat.rs | 98 ++++++++++++++----- .../test_files/string/string_literal.slt | 24 +++++ 2 files changed, 95 insertions(+), 27 deletions(-) diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 4e38ec9af3859..f100a29e309e2 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use crate::utils::utf8_to_str_type; use arrow::array::{ Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, - OffsetSizeTrait, StringArrayType, StringViewArray, + StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder, }; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; @@ -91,6 +91,9 @@ impl ScalarUDFImpl for RepeatFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types[0] == Utf8View { + return Ok(Utf8View); + } utf8_to_str_type(&arg_types[0], "repeat") } @@ -126,13 +129,12 @@ impl 
ScalarUDFImpl for RepeatFunc { }; let result = match string_scalar { - ScalarValue::Utf8(Some(s)) | ScalarValue::Utf8View(Some(s)) => { - ScalarValue::Utf8(Some(compute_repeat( - s, - count, - i32::MAX as usize, - )?)) - } + ScalarValue::Utf8View(Some(s)) => ScalarValue::Utf8View(Some( + compute_repeat(s, count, i32::MAX as usize)?, + )), + ScalarValue::Utf8(Some(s)) => ScalarValue::Utf8(Some( + compute_repeat(s, count, i32::MAX as usize)?, + )), ScalarValue::LargeUtf8(Some(s)) => ScalarValue::LargeUtf8(Some( compute_repeat(s, count, i64::MAX as usize)?, )), @@ -183,26 +185,47 @@ fn repeat(string_array: &ArrayRef, count_array: &ArrayRef) -> Result { match string_array.data_type() { Utf8View => { let string_view_array = string_array.as_string_view(); - repeat_impl::( + let (_, max_item_capacity) = calculate_capacities( &string_view_array, number_array, i32::MAX as usize, + )?; + let builder = StringViewBuilder::with_capacity(string_array.len()); + repeat_impl::<&StringViewArray, StringViewBuilder>( + &string_view_array, + number_array, + max_item_capacity, + builder, ) } Utf8 => { let string_arr = string_array.as_string::(); - repeat_impl::>( + let (total_capacity, max_item_capacity) = + calculate_capacities(&string_arr, number_array, i32::MAX as usize)?; + let builder = GenericStringBuilder::::with_capacity( + string_array.len(), + total_capacity, + ); + repeat_impl::<&GenericStringArray, GenericStringBuilder>( &string_arr, number_array, - i32::MAX as usize, + max_item_capacity, + builder, ) } LargeUtf8 => { let string_arr = string_array.as_string::(); - repeat_impl::>( + let (total_capacity, max_item_capacity) = + calculate_capacities(&string_arr, number_array, i64::MAX as usize)?; + let builder = GenericStringBuilder::::with_capacity( + string_array.len(), + total_capacity, + ); + repeat_impl::<&GenericStringArray, GenericStringBuilder>( &string_arr, number_array, - i64::MAX as usize, + max_item_capacity, + builder, ) } other => exec_err!( @@ -212,17 +235,17 @@ 
fn repeat(string_array: &ArrayRef, count_array: &ArrayRef) -> Result { } } -fn repeat_impl<'a, T, S>( +fn calculate_capacities<'a, S>( string_array: &S, number_array: &Int64Array, max_str_len: usize, -) -> Result +) -> Result<(usize, usize)> where - T: OffsetSizeTrait, - S: StringArrayType<'a> + 'a, + S: StringArrayType<'a>, { let mut total_capacity = 0; let mut max_item_capacity = 0; + string_array.iter().zip(number_array.iter()).try_for_each( |(string, number)| -> Result<(), DataFusionError> { match (string, number) { @@ -244,9 +267,19 @@ where }, )?; - let mut builder = - GenericStringBuilder::::with_capacity(string_array.len(), total_capacity); + Ok((total_capacity, max_item_capacity)) +} +fn repeat_impl<'a, S, B>( + string_array: &S, + number_array: &Int64Array, + max_item_capacity: usize, + mut builder: B, +) -> Result +where + S: StringArrayType<'a> + 'a, + B: StringLikeArrayBuilder, +{ // Reusable buffer to avoid allocations in string.repeat() let mut buffer = Vec::::with_capacity(max_item_capacity); @@ -303,8 +336,8 @@ where #[cfg(test)] mod tests { - use arrow::array::{Array, StringArray}; - use arrow::datatypes::DataType::Utf8; + use arrow::array::{Array, LargeStringArray, StringArray, StringViewArray}; + use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; use datafusion_common::ScalarValue; use datafusion_common::{Result, exec_err}; @@ -357,8 +390,8 @@ mod tests { ], Ok(Some("PgPgPgPg")), &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); test_function!( RepeatFunc::new(), @@ -368,8 +401,19 @@ mod tests { ], Ok(None), &str, - Utf8, - StringArray + Utf8View, + StringViewArray + ); + test_function!( + RepeatFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("Pg")))), + ColumnarValue::Scalar(ScalarValue::Int64(None)), + ], + Ok(None), + &str, + LargeUtf8, + LargeStringArray ); test_function!( RepeatFunc::new(), @@ -379,8 +423,8 @@ mod tests { ], Ok(None), &str, - Utf8, - StringArray + Utf8View, + 
StringViewArray ); test_function!( RepeatFunc::new(), diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 569dfe0336f74..d4fe8ee178719 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -347,11 +347,35 @@ SELECT repeat('foo', 3) ---- foofoofoo +query T +SELECT repeat(arrow_cast('foo', 'LargeUtf8'), 3) +---- +foofoofoo + +query T +SELECT repeat(arrow_cast('foo', 'Utf8View'), 3) +---- +foofoofoo + query T SELECT repeat(arrow_cast('foo', 'Dictionary(Int32, Utf8)'), 3) ---- foofoofoo +query T +SELECT arrow_typeof(repeat('foo', 3)) +---- +Utf8 + +query T +SELECT arrow_typeof(repeat(arrow_cast('foo', 'LargeUtf8'), 3)) +---- +LargeUtf8 + +query T +SELECT arrow_typeof(repeat(arrow_cast('foo', 'Utf8View'), 3)) +---- +Utf8View query T SELECT replace('foobar', 'bar', 'hello')