From 052add63f335ddc9030aba8202e0776bbfa7c253 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 8 Mar 2026 01:32:51 +0800 Subject: [PATCH 1/3] Native engine crashes on all-literal RLIKE expression --- .../spark-expr/src/predicate_funcs/rlike.rs | 28 +++++++++++++++++-- .../expressions/string/rlike_enabled.sql | 2 +- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index 099e9852cb..02dc414ec5 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -21,7 +21,7 @@ use arrow::array::types::Int32Type; use arrow::array::{Array, BooleanArray, DictionaryArray, RecordBatch, StringArray}; use arrow::compute::take; use arrow::datatypes::{DataType, Schema}; -use datafusion::common::{internal_err, Result}; +use datafusion::common::{internal_err, Result, ScalarValue}; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ColumnarValue; use regex::Regex; @@ -140,8 +140,30 @@ impl PhysicalExpr for RLike { let array = self.is_match(inputs); Ok(ColumnarValue::Array(Arc::new(array))) } - ColumnarValue::Scalar(_) => { - internal_err!("non scalar regexp patterns are not supported") + ColumnarValue::Scalar(scalar) => { + // Handle scalar input (all-literal RLIKE expressions) + // This case occurs when ConstantFolding is disabled and both + // the input string and pattern are literals + if scalar.is_null() { + // NULL RLIKE pattern -> NULL result + return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); + } + + // Extract string value from scalar and match pattern + // We handle each type separately to avoid lifetime issues with Utf8View + let is_match = match scalar { + ScalarValue::Utf8(Some(s)) => self.pattern.is_match(s.as_str()), + ScalarValue::LargeUtf8(Some(s)) => self.pattern.is_match(s.as_str()), + ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(s.as_str()), + _ => { + return internal_err!( + "RLike requires string type for input, got {:?}", + scalar.data_type() + ); + } + }; + + Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(is_match)))) } } } diff --git a/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql b/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql index 822fb3ddb8..1de215a770 100644 --- a/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql +++ b/spark/src/test/resources/sql-tests/expressions/string/rlike_enabled.sql @@ -35,5 +35,5 @@ query SELECT s RLIKE '' FROM test_rlike_enabled -- literal arguments -query ignore(https://github.com/apache/datafusion-comet/issues/3343) +query SELECT 'hello' RLIKE '^[a-z]+$', '12345' RLIKE '^[a-z]+$', '' RLIKE '', NULL RLIKE 'a' From 778e3f77d6bc2e156e4655729b1444f890695987 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 9 Mar 2026 12:27:32 +0800 Subject: [PATCH 2/3] add test --- .../spark-expr/src/predicate_funcs/rlike.rs | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index 02dc414ec5..a70190f8dc 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -22,6 +22,8 @@ use arrow::array::{Array, BooleanArray, DictionaryArray, RecordBatch, StringArra use arrow::compute::take; use arrow::datatypes::{DataType, Schema}; use datafusion::common::{internal_err, Result, ScalarValue}; +#[cfg(test)] +use datafusion::physical_expr::expressions::Literal; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ColumnarValue; use regex::Regex; @@ -141,20 +143,14 @@ impl PhysicalExpr for RLike { Ok(ColumnarValue::Array(Arc::new(array))) } ColumnarValue::Scalar(scalar) => { - // Handle scalar input (all-literal RLIKE expressions) - // This case occurs when ConstantFolding is disabled and both - // the input string and pattern are literals if scalar.is_null() { - // NULL RLIKE pattern -> NULL result return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); } - // Extract string value from scalar and match pattern - // We handle each type separately to avoid lifetime issues with Utf8View let is_match = match scalar { - ScalarValue::Utf8(Some(s)) => self.pattern.is_match(s.as_str()), - ScalarValue::LargeUtf8(Some(s)) => self.pattern.is_match(s.as_str()), - ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(s.as_str()), + ScalarValue::Utf8(Some(s)) + | ScalarValue::LargeUtf8(Some(s)) + | ScalarValue::Utf8View(Some(s)) => self.pattern.is_match(s.as_str()), _ => { return internal_err!( "RLike requires string type for input, got {:?}", @@ -187,3 +183,25 @@ impl PhysicalExpr for RLike { Display::fmt(self, f) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_rlike_scalar_utf8_literal() { + let expr = RLike::try_new( + Arc::new(Literal::new(ScalarValue::Utf8(Some("Rose".to_string())))), + "R[a-z]+", + ) + .unwrap(); + let result = expr + .evaluate(&RecordBatch::new_empty(Arc::new(Schema::empty()))) + .unwrap(); + let ColumnarValue::Scalar(result) = result else { + panic!("expected scalar result"); + }; + + assert_eq!(result, ScalarValue::Boolean(Some(true))); + } +} From 8e33b42b5eb9f9730533bf381df623f00a57fb56 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 11 Mar 2026 11:50:46 +0800 Subject: [PATCH 3/3] address comment --- native/spark-expr/src/predicate_funcs/rlike.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/native/spark-expr/src/predicate_funcs/rlike.rs b/native/spark-expr/src/predicate_funcs/rlike.rs index a70190f8dc..80cfb94980 100644 --- a/native/spark-expr/src/predicate_funcs/rlike.rs +++ b/native/spark-expr/src/predicate_funcs/rlike.rs @@ -22,8 +22,6 @@ use arrow::array::{Array, BooleanArray, DictionaryArray, RecordBatch, StringArra use arrow::compute::take; use arrow::datatypes::{DataType, Schema}; use datafusion::common::{internal_err, Result, ScalarValue}; -#[cfg(test)] -use datafusion::physical_expr::expressions::Literal; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::ColumnarValue; use regex::Regex; @@ -187,6 +185,7 @@ impl PhysicalExpr for RLike { #[cfg(test)] mod tests { use super::*; + use datafusion::physical_expr::expressions::Literal; #[test] fn test_rlike_scalar_utf8_literal() {