From 6d930affdc2264c3273982d6d8334fec79885add Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Mon, 9 Mar 2026 17:21:24 +0800 Subject: [PATCH 1/3] [AutoSparkUT] Fix LIKE with invalid escape pattern to match CPU behavior (#14117) Validate LIKE escape patterns on GPU to match CPU semantics. Previously, GpuLike passed patterns directly to cuDF which silently accepted invalid escape sequences (e.g. LIKE 'm%@ca' ESCAPE '%'), returning wrong results instead of throwing AnalysisException like CPU does. - Add tagExprForGpu in BinaryExprMeta[Like] to detect invalid escape patterns and fall back to CPU, which throws the correct exception. - Add defensive validation in GpuLike.doColumnar as a safety net. - Remove SPARK-33677 exclusion from RapidsTestSettings. Closes https://github.com/NVIDIA/spark-rapids/issues/14117 Maven validation: mvn test -pl tests -Dbuildver=330 \ -DwildcardSuites=...RapidsSQLQuerySuite Tests: succeeded 215, failed 0, ignored 18 Signed-off-by: Allen Xu Made-with: Cursor --- .../com/nvidia/spark/rapids/GpuOverrides.scala | 18 ++++++++++++++++++ .../spark/sql/rapids/stringFunctions.scala | 8 ++++++++ .../sql/rapids/utils/RapidsTestSettings.scala | 1 - 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index fef7bea85dd..78b3a96bdd4 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -3501,6 +3501,24 @@ object GpuOverrides extends Logging { ("src", TypeSig.STRING, TypeSig.STRING), ("search", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING)), (a, conf, p, r) => new BinaryExprMeta[Like](a, conf, p, r) { + override def tagExprForGpu(): Unit = { + import org.apache.spark.sql.catalyst.util.StringUtils + try { + a.right match { + case l: Literal + if l.value != null && + l.dataType == StringType => + StringUtils.escapeLikeRegex( + l.value.asInstanceOf[UTF8String].toString, + a.escapeChar) + case _ => + } + } catch { + case _: Exception => + willNotWorkOnGpu( + "invalid LIKE escape pattern") + } + } override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = GpuLike(lhs, rhs, a.escapeChar) }), diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 1608446357e..8b95b334730 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -36,6 +36,7 @@ import com.nvidia.spark.rapids.jni.RegexRewriteUtils import com.nvidia.spark.rapids.shims.{NullIntolerantShim, ShimExpression, SparkShimImpl} import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.errors.ConvUtils import org.apache.spark.sql.rapids.catalyst.expressions._ import org.apache.spark.sql.types._ @@ -976,12 +977,19 @@ case class GpuLike(left: Expression, right: Expression, escapeChar: Char) def this(left: Expression, right: Expression) = this(left, right, '\\') + @transient private var escapeValidated = false + override def toString: String = escapeChar match { case '\\' => s"$left gpulike $right" case c => s"$left gpulike $right ESCAPE '$c'" } override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { + if (!escapeValidated && rhs.isValid) { + StringUtils.escapeLikeRegex( + rhs.getValue.asInstanceOf[UTF8String].toString, escapeChar) + escapeValidated = true + } withResource(Scalar.fromString(Character.toString(escapeChar))) { escapeScalar => lhs.getBase.like(rhs.getBase, escapeScalar) } diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index 085d823d383..72d17d3b4da 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -227,7 +227,6 @@ class RapidsTestSettings extends BackendTestSettings { .exclude("SPARK-19650: An action on a Command should not trigger a Spark job", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/14110")) .exclude("SPARK-31594: Do not display the seed of rand/randn with no argument in output schema", ADJUST_UT("Replaced by testRapids version with a correct regex expression to match the projectExplainOutput, randn isn't supported now. See https://github.com/NVIDIA/spark-rapids/issues/11613")) .exclude("normalize special floating numbers in subquery", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/14116")) - .exclude("SPARK-33677: LikeSimplification should be skipped if pattern contains any escapeChar", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/14117")) .exclude("SPARK-33593: Vector reader got incorrect data with binary partition value", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/14118")) .exclude("SPARK-33084: Add jar support Ivy URI in SQL -- jar contains udf class", ADJUST_UT("Replaced by testRapids version that uses testFile() to access Spark test resources instead of getContextClassLoader")) .exclude("SPARK-33482: Fix FileScan canonicalization", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/14122")) From 0e1b4ad3d51732b8af972f84aa790544d3dc6e64 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Tue, 10 Mar 2026 12:58:21 +0800 Subject: [PATCH 2/3] Address review: use NonFatal, safer type handling in LIKE validation - Use NonFatal(e) instead of catching Exception in tagExprForGpu to avoid swallowing fatal errors; include e.getMessage for diagnostics. - Check l.value.isInstanceOf[UTF8String] instead of l.dataType == StringType to correctly handle CharType/VarcharType whose values are UTF8String at runtime. - Use rhs.getValue.toString in GpuLike.doColumnar to safely handle both UTF8String and String backed scalars. Signed-off-by: Allen Xu Made-with: Cursor --- .../scala/com/nvidia/spark/rapids/GpuOverrides.scala | 11 +++++------ .../org/apache/spark/sql/rapids/stringFunctions.scala | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 78b3a96bdd4..9dda8aae3fe 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -3506,17 +3506,16 @@ object GpuOverrides extends Logging { try { a.right match { case l: Literal - if l.value != null && - l.dataType == StringType => + if l.value.isInstanceOf[UTF8String] => StringUtils.escapeLikeRegex( - l.value.asInstanceOf[UTF8String].toString, - a.escapeChar) + l.value.toString, a.escapeChar) case _ => } } catch { - case _: Exception => + case NonFatal(e) => willNotWorkOnGpu( - "invalid LIKE escape pattern") + s"invalid LIKE escape pattern: " + + s"${e.getMessage}") } } override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index 8b95b334730..a33dedff3dc 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -987,7 +987,7 @@ case class GpuLike(left: Expression, right: Expression, escapeChar: Char) override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { if (!escapeValidated && rhs.isValid) { StringUtils.escapeLikeRegex( - rhs.getValue.asInstanceOf[UTF8String].toString, escapeChar) + rhs.getValue.toString, escapeChar) escapeValidated = true } withResource(Scalar.fromString(Character.toString(escapeChar))) { escapeScalar => From bd7f8eb17d77c8dfa586d4f577bd6953e527a081 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Thu, 12 Mar 2026 15:52:41 +0800 Subject: [PATCH 3/3] Address revans2/res-life review: simplify LIKE escape validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace StringUtils.escapeLikeRegex call with a focused O(n) escape char validation that only checks the two invalid cases (escape char at end of pattern, escape char followed by non-special character). This avoids building a full regex string during planning. - Remove runtime safety net in GpuLike.doColumnar — tagExprForGpu already prevents GpuLike from being created for invalid patterns. - Remove now-unused StringUtils import from stringFunctions.scala. Signed-off-by: Allen Xu Made-with: Cursor Signed-off-by: Allen Xu --- .../nvidia/spark/rapids/GpuOverrides.scala | 39 ++++++++++++------- .../spark/sql/rapids/stringFunctions.scala | 8 ---- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 9dda8aae3fe..573c22e3312 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -3502,20 +3502,31 @@ object GpuOverrides extends Logging { ("search", TypeSig.lit(TypeEnum.STRING), TypeSig.STRING)), (a, conf, p, r) => new BinaryExprMeta[Like](a, conf, p, r) { override def tagExprForGpu(): Unit = { - import org.apache.spark.sql.catalyst.util.StringUtils - try { - a.right match { - case l: Literal - if l.value.isInstanceOf[UTF8String] => - StringUtils.escapeLikeRegex( - l.value.toString, a.escapeChar) - case _ => - } - } catch { - case NonFatal(e) => - willNotWorkOnGpu( - s"invalid LIKE escape pattern: " + - s"${e.getMessage}") + a.right match { + case Literal(v: UTF8String, _) => + val pattern = v.toString + val esc = a.escapeChar + var i = 0 + while (i < pattern.length) { + if (pattern.charAt(i) == esc) { + val j = i + 1 + if (j >= pattern.length) { + willNotWorkOnGpu( + "invalid LIKE escape pattern") + return + } + val c = pattern.charAt(j) + if (c != '_' && c != '%' && c != esc) { + willNotWorkOnGpu( + "invalid LIKE escape pattern") + return + } + i = j + 1 + } else { + i += 1 + } + } + case _ => } } override def convertToGpu(lhs: Expression, rhs: Expression): GpuExpression = diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index a33dedff3dc..1608446357e 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -36,7 +36,6 @@ import com.nvidia.spark.rapids.jni.RegexRewriteUtils import com.nvidia.spark.rapids.shims.{NullIntolerantShim, ShimExpression, SparkShimImpl} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.errors.ConvUtils import org.apache.spark.sql.rapids.catalyst.expressions._ import org.apache.spark.sql.types._ @@ -977,19 +976,12 @@ case class GpuLike(left: Expression, right: Expression, escapeChar: Char) def this(left: Expression, right: Expression) = this(left, right, '\\') - @transient private var escapeValidated = false - override def toString: String = escapeChar match { case '\\' => s"$left gpulike $right" case c => s"$left gpulike $right ESCAPE '$c'" } override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { - if (!escapeValidated && rhs.isValid) { - StringUtils.escapeLikeRegex( - rhs.getValue.toString, escapeChar) - escapeValidated = true - } withResource(Scalar.fromString(Character.toString(escapeChar))) { escapeScalar => lhs.getBase.like(rhs.getBase, escapeScalar) }