From b79b83edeeffadc6d951e2126fa7502be78bc576 Mon Sep 17 00:00:00 2001 From: iiFeung Date: Mon, 24 Mar 2025 10:36:01 +0800 Subject: [PATCH 1/6] rebase onto origin:main --- .../gluten/utils/CHExpressionUtil.scala | 5 ++- .../ScalarFunctionsValidateSuite.scala | 41 +++++++++++++++++++ .../gluten/expression/ExpressionNames.scala | 5 +++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala index 8365245b848d..37308f8c0b43 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala @@ -207,6 +207,9 @@ object CHExpressionUtil { MAKE_DATE -> DefaultValidator(), ARRAY_APPEND -> DefaultValidator(), JSON_OBJECT_KEYS -> DefaultValidator(), - LUHN_CHECK -> DefaultValidator() + LUHN_CHECK -> DefaultValidator(), + VARCHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(), + CHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(), + READ_SIDE_PADDING -> DefaultValidator() ) } diff --git a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala index 097e6fc68e35..02c597bbbfa1 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala @@ -622,6 +622,47 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { } } + // Add test suite for CharVarcharCodegenUtils functions. + // A ProjectExecTransformer is expected to be constructed after expr support. 
+ // We currently test below functions with Spark v3.4 + testWithMinSparkVersion("Test charTypeWriteSideCheck function", "3.4") { + withTable("src", "dest") { + + sql("create table src(id string) USING PARQUET") + sql("insert into src values('s')") + sql("create table dest(id char(3)) USING PARQUET") + // check whether the executed plan of a dataframe contains the expected plan. + runQueryAndCompare("insert into dest select id from src") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + } + + testWithMinSparkVersion("Test varcharTypeWriteSideCheck function", "3.4") { + withTable("src", "dest") { + + sql("create table src(id string) USING PARQUET") + sql("insert into src values('abc')") + sql("create table dest(id varchar(10)) USING PARQUET") + // check whether the executed plan of a dataframe contains the expected plan. + runQueryAndCompare("insert into dest select id from src") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + } + + testWithMinSparkVersion("Test readSidePadding function", "3.4") { + withTable("src", "dest") { + + sql("create table tgt(id char(3)) USING PARQUET") + sql("insert into tgt values('p')") + // check whether the executed plan of a dataframe contains the expected plan. 
+ runQueryAndCompare("select id from tgt") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + } + test("soundex") { runQueryAndCompare("select soundex(c_comment) from customer limit 50") { checkGlutenOperatorMatch[ProjectExecTransformer] diff --git a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala index 8fcdc01e5e86..329ac65e76f9 100644 --- a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala +++ b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala @@ -372,6 +372,11 @@ object ExpressionNames { final val UDF_PLACEHOLDER = "udf_placeholder" final val UDAF_PLACEHOLDER = "udaf_placeholder" + // Spark StaticInvoke Catalyst util functions + final val VARCHAR_TYPE_WRITE_SIDE_CHECK = "varchar_type_write_side_check" + final val CHAR_TYPE_WRITE_SIDE_CHECK = "char_type_write_side_check" + final val READ_SIDE_PADDING = "read_side_padding" + // Iceberg function names final val YEARS = "years" final val MONTHS = "months" From 104cb445806f7ac2b80e8908f8e7f1be35a266d6 Mon Sep 17 00:00:00 2001 From: iiFeung Date: Mon, 15 Sep 2025 20:02:40 +0800 Subject: [PATCH 2/6] Rename test names in ScalarFunctionsValidateSuite to be consistent in style with other cases. 
--- .../gluten/functions/ScalarFunctionsValidateSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala index 02c597bbbfa1..f6be6061316a 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala @@ -625,7 +625,7 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { // Add test suite for CharVarcharCodegenUtils functions. // A ProjectExecTransformer is expected to be constructed after expr support. // We currently test below functions with Spark v3.4 - testWithMinSparkVersion("Test charTypeWriteSideCheck function", "3.4") { + testWithMinSparkVersion("charTypeWriteSideCheck", "3.4") { withTable("src", "dest") { sql("create table src(id string) USING PARQUET") @@ -638,7 +638,7 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { } } - testWithMinSparkVersion("Test varcharTypeWriteSideCheck function", "3.4") { + testWithMinSparkVersion("varcharTypeWriteSideCheck", "3.4") { withTable("src", "dest") { sql("create table src(id string) USING PARQUET") @@ -651,7 +651,7 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { } } - testWithMinSparkVersion("Test readSidePadding function", "3.4") { + testWithMinSparkVersion("readSidePadding", "3.4") { withTable("src", "dest") { sql("create table tgt(id char(3)) USING PARQUET") From 4abd4b526222bfde4a3cfe9a25c5bb9e91016363 Mon Sep 17 00:00:00 2001 From: iiFeung Date: Thu, 25 Sep 2025 10:55:13 +0800 Subject: [PATCH 3/6] Fix ut test fails in CharVarcharTestSuite across multiple Spark versions. 
--- .../utils/velox/VeloxTestSettings.scala | 4 + .../sql/GlutenCharVarcharTestSuite.scala | 69 ++++++- .../utils/velox/VeloxTestSettings.scala | 2 + .../sql/GlutenCharVarcharTestSuite.scala | 44 +++- .../utils/velox/VeloxTestSettings.scala | 15 ++ .../sql/GlutenCharVarcharTestSuite.scala | 192 +++++++++++++++++- .../utils/velox/VeloxTestSettings.scala | 13 ++ .../sql/GlutenCharVarcharTestSuite.scala | 190 ++++++++++++++++- 8 files changed, 515 insertions(+), 14 deletions(-) diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 43a3250fc936..847fde62da7e 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -422,7 +422,11 @@ class VeloxTestSettings extends BackendTestSettings { .excludeByPrefix("SPARK-24705") .excludeByPrefix("determining the number of reducers") enableSuite[GlutenFileSourceCharVarcharTestSuite] + .exclude("length check for input string values: nested in struct") + .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2CharVarcharTestSuite] + .exclude("length check for input string values: nested in struct") + .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenFileScanSuite] enableSuite[GlutenNestedDataSourceV1Suite] enableSuite[GlutenNestedDataSourceV2Suite] diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala index 84502ace5110..8dcb7bbfd842 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala @@ -16,8 +16,73 @@ 
*/ package org.apache.spark.sql +import org.apache.spark.SparkException + class GlutenFileSourceCharVarcharTestSuite extends FileSourceCharVarcharTestSuite - with GlutenSQLTestsTrait {} + with GlutenSQLTestsTrait { + + private def testTableWrite(f: String => Unit): Unit = { + withTable("t")(f("char")) + withTable("t")(f("varchar")) + } + + private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5" + + testGluten("length check for input string values: nested in struct") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT) USING $format") + sql("INSERT INTO t SELECT struct(null)") + checkAnswer(spark.table("t"), Row(Row(null))) + val e = intercept[SparkException] { + sql("INSERT INTO t SELECT struct('123456')") + } + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in struct of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT>) USING $format") + sql("INSERT INTO t SELECT struct(array(null))") + checkAnswer(spark.table("t"), Row(Row(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } +} + +class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait { + private def testTableWrite(f: String => Unit): Unit = { + withTable("t")(f("char")) + withTable("t")(f("varchar")) + } + + private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5" + + testGluten("length check for input string values: nested in struct") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT) USING $format") + sql("INSERT INTO t SELECT struct(null)") + checkAnswer(spark.table("t"), Row(Row(null))) + val e = intercept[SparkException] { + sql("INSERT INTO t SELECT struct('123456')") + } + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } -class GlutenDSV2CharVarcharTestSuite 
extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {} + testGluten("length check for input string values: nested in struct of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT>) USING $format") + sql("INSERT INTO t SELECT struct(array(null))") + checkAnswer(spark.table("t"), Row(Row(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } +} diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 52ce14bda370..c0ff132ae834 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -616,7 +616,9 @@ class VeloxTestSettings extends BackendTestSettings { // Extra ColumnarToRow is needed to transform vanilla columnar data to gluten columnar data. .exclude("SPARK-37369: Avoid redundant ColumnarToRow transition on InMemoryTableScan") enableSuite[GlutenFileSourceCharVarcharTestSuite] + .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2CharVarcharTestSuite] + .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenColumnExpressionSuite] // Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'. 
// The final caught Spark exception's getCause().getMessage() contains 'errMsg' but does not diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala index 84502ace5110..8c59c323ee26 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala @@ -16,8 +16,48 @@ */ package org.apache.spark.sql +import org.apache.spark.SparkException + class GlutenFileSourceCharVarcharTestSuite extends FileSourceCharVarcharTestSuite - with GlutenSQLTestsTrait {} + with GlutenSQLTestsTrait { + + private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5" + + private def testTableWrite(f: String => Unit): Unit = { + withTable("t")(f("char")) + withTable("t")(f("varchar")) + } + + testGluten("length check for input string values: nested in struct of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT>) USING $format") + sql("INSERT INTO t SELECT struct(array(null))") + checkAnswer(spark.table("t"), Row(Row(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } +} + +class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait { + + private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5" + + private def testTableWrite(f: String => Unit): Unit = { + withTable("t")(f("char")) + withTable("t")(f("varchar")) + } -class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {} + testGluten("length check for input string values: nested in struct of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT>) USING $format") + sql("INSERT INTO t SELECT struct(array(null))") + 
checkAnswer(spark.table("t"), Row(Row(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } +} diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 519cef5b7678..831ea1a2e91b 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -654,7 +654,22 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("length check for input string values: nested in both map key and value") .exclude("length check for input string values: nested in array of struct") .exclude("length check for input string values: nested in array of array") + .exclude("length check for input string values: top-level columns") + .exclude("length check for input string values: partitioned columns") + .exclude("length check for input string values: nested in struct of array") + .exclude("length check for input string values: with implicit cast") + .exclude("char/varchar type values length check: partitioned columns of other types") + enableSuite[GlutenDSV2CharVarcharTestSuite] + .exclude("SPARK-42611: check char/varchar length in reordered structs within arrays") + .exclude("char/varchar type values length check: partitioned columns of other types") + .exclude("length check for input string values: top-level columns") + .exclude("length check for input string values: nested in array") + .exclude("length check for input string values: nested in struct of array") + .exclude("length check for input string values: nested in array of struct") + .exclude("length check for input string values: nested in array of array") + .exclude("length check for input string values: with implicit cast") + 
enableSuite[GlutenColumnExpressionSuite] // Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'. // The final caught Spark exception's getCause().getMessage() contains 'errMsg' but does not diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala index 89d9114870b6..4f0d826f57b7 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql import org.apache.spark.SparkException +import org.apache.spark.sql.internal.SQLConf + class GlutenFileSourceCharVarcharTestSuite extends FileSourceCharVarcharTestSuite with GlutenSQLTestsTrait { @@ -27,6 +29,8 @@ class GlutenFileSourceCharVarcharTestSuite private val ERROR_MESSAGE = "Exceeds char/varchar type length limitation: 5" + private val VELOX_ERROR_MESSAGE = + "Exceeds allowed length limitation: 5" testGluten("length check for input string values: nested in struct") { testTableWrite { @@ -50,7 +54,7 @@ class GlutenFileSourceCharVarcharTestSuite val e = intercept[SparkException] { sql("INSERT INTO t VALUES (array('a', '123456'))") } - assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) } } @@ -92,7 +96,7 @@ class GlutenFileSourceCharVarcharTestSuite sql("INSERT INTO t SELECT struct(array(null))") checkAnswer(spark.table("t"), Row(Row(Seq(null)))) val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) - assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) } } @@ -103,7 +107,7 @@ class GlutenFileSourceCharVarcharTestSuite sql("INSERT INTO t VALUES (array(struct(null)))") checkAnswer(spark.table("t"), Row(Seq(Row(null)))) val e = 
intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))")) - assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) } } @@ -114,9 +118,187 @@ class GlutenFileSourceCharVarcharTestSuite sql("INSERT INTO t VALUES (array(array(null)))") checkAnswer(spark.table("t"), Row(Seq(Seq(null)))) val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))")) - assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: top-level columns") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c $typeName(5)) USING $format") + sql("INSERT INTO t VALUES (null)") + checkAnswer(spark.table("t"), Row(null)) + val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) } } + + testGluten("length check for input string values: partitioned columns") { + // DS V2 doesn't support partitioned table. 
+ if (!conf.contains(SQLConf.DEFAULT_CATALOG.key)) { + val tableName = "t" + testTableWrite { + typeName => + sql(s"CREATE TABLE $tableName(i INT, c $typeName(5)) USING $format PARTITIONED BY (c)") + sql(s"INSERT INTO $tableName VALUES (1, null)") + checkAnswer(spark.table(tableName), Row(1, null)) + val e = intercept[SparkException](sql(s"INSERT INTO $tableName VALUES (1, '123456')")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + } + + testGluten("length check for input string values: with implicit cast") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO t VALUES (1234, 1234)") + checkAnswer(spark.table("t"), Row("1234 ", "1234")) + val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)")) + assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("char/varchar type values length check: partitioned columns of other types") { + val tableName = "t" + Seq("CHAR(5)", "VARCHAR(5)").foreach { + typ => + withTable(tableName) { + sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)") + Seq(1, 10, 100, 1000, 10000).foreach { + v => + sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)") + checkPlainResult(spark.table(tableName), typ, v.toString) + sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)") + checkAnswer(spark.table(tableName), Nil) + } + + val e1 = + intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + + val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)")) + assert(e2.getMessage.contains(ERROR_MESSAGE)) + } + } + } + } -class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {} +class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite 
with GlutenSQLTestsTrait { + private val ERROR_MESSAGE = + "Exceeds char/varchar type length limitation: 5" + private val VELOX_ERROR_MESSAGE = + "Exceeds allowed length limitation: 5" + + private def testTableWrite(f: String => Unit): Unit = { + withTable("t")(f("char")) + withTable("t")(f("varchar")) + } + + testGluten("length check for input string values: top-level columns") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c $typeName(5)) USING $format") + sql("INSERT INTO t VALUES (null)") + checkAnswer(spark.table("t"), Row(null)) + val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c ARRAY<$typeName(5)>) USING $format") + sql("INSERT INTO t VALUES (array(null))") + checkAnswer(spark.table("t"), Row(Seq(null))) + val e = intercept[SparkException] { + sql("INSERT INTO t VALUES (array('a', '123456'))") + } + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in struct of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT>) USING $format") + sql("INSERT INTO t SELECT struct(array(null))") + checkAnswer(spark.table("t"), Row(Row(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in array of struct") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c ARRAY>) USING $format") + sql("INSERT INTO t VALUES (array(struct(null)))") + checkAnswer(spark.table("t"), Row(Seq(Row(null)))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input 
string values: nested in array of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c ARRAY>) USING $format") + sql("INSERT INTO t VALUES (array(array(null)))") + checkAnswer(spark.table("t"), Row(Seq(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: with implicit cast") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO t VALUES (1234, 1234)") + checkAnswer(spark.table("t"), Row("1234 ", "1234")) + val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)")) + assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("char/varchar type values length check: partitioned columns of other types") { + val tableName = "t" + Seq("CHAR(5)", "VARCHAR(5)").foreach { + typ => + withTable(tableName) { + sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)") + Seq(1, 10, 100, 1000, 10000).foreach { + v => + sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)") + checkPlainResult(spark.table(tableName), typ, v.toString) + sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)") + checkAnswer(spark.table(tableName), Nil) + } + + val e1 = + intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + + val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)")) + assert(e2.getMessage.contains(ERROR_MESSAGE)) + } + } + } + + testGluten("SPARK-42611: check char/varchar length in reordered structs within arrays") { + Seq("CHAR(5)", "VARCHAR(5)").foreach { + typ => + withTable("t") { + sql(s"CREATE TABLE t(a ARRAY>) USING $format") + val inputDF = sql("SELECT 
array(named_struct('n_i', 1, 'n_c', '123456')) AS a") + val e = intercept[SparkException](inputDF.writeTo("t").append()) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + } +} diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 27af909029a9..da4db937cc0b 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -676,7 +676,20 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("length check for input string values: nested in both map key and value") .exclude("length check for input string values: nested in array of struct") .exclude("length check for input string values: nested in array of array") + .exclude("length check for input string values: top-level columns") + .exclude("length check for input string values: partitioned columns") + .exclude("length check for input string values: nested in struct of array") + .exclude("length check for input string values: with implicit cast") + .exclude("char/varchar type values length check: partitioned columns of other types") enableSuite[GlutenDSV2CharVarcharTestSuite] + .exclude("length check for input string values: top-level columns") + .exclude("length check for input string values: nested in array") + .exclude("length check for input string values: nested in struct of array") + .exclude("length check for input string values: nested in array of struct") + .exclude("length check for input string values: nested in array of array") + .exclude("length check for input string values: with implicit cast") + .exclude("char/varchar type values length check: partitioned columns of other types") + .exclude("SPARK-42611: check char/varchar length in reordered structs within arrays") enableSuite[GlutenColumnExpressionSuite] // 
Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'. // The final caught Spark exception's getCause().getMessage() contains 'errMsg' but does not diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala index ce2f1b465e7f..689946547d15 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenCharVarcharTestSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql import org.apache.spark.SparkException +import org.apache.spark.sql.internal.SQLConf class GlutenFileSourceCharVarcharTestSuite extends FileSourceCharVarcharTestSuite @@ -28,6 +29,8 @@ class GlutenFileSourceCharVarcharTestSuite private val ERROR_MESSAGE = "Exceeds char/varchar type length limitation: 5" + private val VELOX_ERROR_MESSAGE = + "Exceeds allowed length limitation: 5" testGluten("length check for input string values: nested in struct") { testTableWrite { @@ -51,7 +54,7 @@ class GlutenFileSourceCharVarcharTestSuite val e = intercept[SparkException] { sql("INSERT INTO t VALUES (array('a', '123456'))") } - assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) } } @@ -93,7 +96,7 @@ class GlutenFileSourceCharVarcharTestSuite sql("INSERT INTO t SELECT struct(array(null))") checkAnswer(spark.table("t"), Row(Row(Seq(null)))) val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) - assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) } } @@ -104,7 +107,7 @@ class GlutenFileSourceCharVarcharTestSuite sql("INSERT INTO t VALUES (array(struct(null)))") checkAnswer(spark.table("t"), Row(Seq(Row(null)))) val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))")) - 
assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) } } @@ -115,9 +118,186 @@ class GlutenFileSourceCharVarcharTestSuite sql("INSERT INTO t VALUES (array(array(null)))") checkAnswer(spark.table("t"), Row(Seq(Seq(null)))) val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))")) - assert(e.getMessage.contains(ERROR_MESSAGE)) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: top-level columns") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c $typeName(5)) USING $format") + sql("INSERT INTO t VALUES (null)") + checkAnswer(spark.table("t"), Row(null)) + val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: partitioned columns") { + // DS V2 doesn't support partitioned table. + if (!conf.contains(SQLConf.DEFAULT_CATALOG.key)) { + val tableName = "t" + testTableWrite { + typeName => + sql(s"CREATE TABLE $tableName(i INT, c $typeName(5)) USING $format PARTITIONED BY (c)") + sql(s"INSERT INTO $tableName VALUES (1, null)") + checkAnswer(spark.table(tableName), Row(1, null)) + val e = intercept[SparkException](sql(s"INSERT INTO $tableName VALUES (1, '123456')")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + } + + testGluten("length check for input string values: with implicit cast") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO t VALUES (1234, 1234)") + checkAnswer(spark.table("t"), Row("1234 ", "1234")) + val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)")) + assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("char/varchar type values 
length check: partitioned columns of other types") { + val tableName = "t" + Seq("CHAR(5)", "VARCHAR(5)").foreach { + typ => + withTable(tableName) { + sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)") + Seq(1, 10, 100, 1000, 10000).foreach { + v => + sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)") + checkPlainResult(spark.table(tableName), typ, v.toString) + sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)") + checkAnswer(spark.table(tableName), Nil) + } + + val e1 = + intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + + val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)")) + assert(e2.getMessage.contains(ERROR_MESSAGE)) + } } } } -class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {} +class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait { + private val ERROR_MESSAGE = + "Exceeds char/varchar type length limitation: 5" + private val VELOX_ERROR_MESSAGE = + "Exceeds allowed length limitation: 5" + + private def testTableWrite(f: String => Unit): Unit = { + withTable("t")(f("char")) + withTable("t")(f("varchar")) + } + + testGluten("length check for input string values: top-level columns") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c $typeName(5)) USING $format") + sql("INSERT INTO t VALUES (null)") + checkAnswer(spark.table("t"), Row(null)) + val e = intercept[SparkException](sql("INSERT INTO t VALUES ('123456')")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c ARRAY<$typeName(5)>) USING $format") + sql("INSERT INTO t VALUES (array(null))") + checkAnswer(spark.table("t"), Row(Seq(null))) + val e = intercept[SparkException] { + sql("INSERT INTO t VALUES (array('a', 
'123456'))") + } + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in struct of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c STRUCT>) USING $format") + sql("INSERT INTO t SELECT struct(array(null))") + checkAnswer(spark.table("t"), Row(Row(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t SELECT struct(array('123456'))")) + assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in array of struct") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c ARRAY>) USING $format") + sql("INSERT INTO t VALUES (array(struct(null)))") + checkAnswer(spark.table("t"), Row(Seq(Row(null)))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(struct('123456')))")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: nested in array of array") { + testTableWrite { + typeName => + sql(s"CREATE TABLE t(c ARRAY>) USING $format") + sql("INSERT INTO t VALUES (array(array(null)))") + checkAnswer(spark.table("t"), Row(Seq(Seq(null)))) + val e = intercept[SparkException](sql("INSERT INTO t VALUES (array(array('123456')))")) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("length check for input string values: with implicit cast") { + withTable("t") { + sql(s"CREATE TABLE t(c1 CHAR(5), c2 VARCHAR(5)) USING $format") + sql("INSERT INTO t VALUES (1234, 1234)") + checkAnswer(spark.table("t"), Row("1234 ", "1234")) + val e1 = intercept[SparkException](sql("INSERT INTO t VALUES (123456, 1)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + val e2 = intercept[SparkException](sql("INSERT INTO t VALUES (1, 123456)")) + assert(e2.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + + testGluten("char/varchar type values length check: partitioned columns of other types") { + val tableName = "t" + 
Seq("CHAR(5)", "VARCHAR(5)").foreach { + typ => + withTable(tableName) { + sql(s"CREATE TABLE $tableName(i STRING, c $typ) USING $format PARTITIONED BY (c)") + Seq(1, 10, 100, 1000, 10000).foreach { + v => + sql(s"INSERT OVERWRITE $tableName VALUES ('1', $v)") + checkPlainResult(spark.table(tableName), typ, v.toString) + sql(s"ALTER TABLE $tableName DROP PARTITION(c=$v)") + checkAnswer(spark.table(tableName), Nil) + } + + val e1 = + intercept[SparkException](sql(s"INSERT OVERWRITE $tableName VALUES ('1', 100000)")) + assert(e1.getMessage.contains(VELOX_ERROR_MESSAGE)) + + val e2 = intercept[RuntimeException](sql("ALTER TABLE t DROP PARTITION(c=100000)")) + assert(e2.getMessage.contains(ERROR_MESSAGE)) + } + } + } + + testGluten("SPARK-42611: check char/varchar length in reordered structs within arrays") { + Seq("CHAR(5)", "VARCHAR(5)").foreach { + typ => + withTable("t") { + sql(s"CREATE TABLE t(a ARRAY>) USING $format") + val inputDF = sql("SELECT array(named_struct('n_i', 1, 'n_c', '123456')) AS a") + val e = intercept[SparkException](inputDF.writeTo("t").append()) + assert(e.getMessage.contains(VELOX_ERROR_MESSAGE)) + } + } + } +} From a0fbecbaf94920dfa5c8b7065acfd3725265b2a1 Mon Sep 17 00:00:00 2001 From: iiFeung Date: Tue, 21 Oct 2025 21:44:56 +0800 Subject: [PATCH 4/6] fix: Extract StaticInvoke handling whilst keep `replaceIcebergStaticInvoke` logics intact. 
--- .../expression/ExpressionConverter.scala | 83 ++++++++++++++----- 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala index 61f5c1be8dd0..41a2a2ff8230 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala @@ -144,6 +144,66 @@ object ExpressionConverter extends SQLConfHelper with Logging { DecimalArithmeticExpressionTransformer(substraitName, leftChild, rightChild, resultType, b) } + private def replaceStaticInvokeWithExpressionTransformer( + i: StaticInvoke, + attributeSeq: Seq[Attribute], + expressionsMap: Map[Class[_], String]): ExpressionTransformer = { + def validateAndTransform( + exprName: String, + childTransformers: => Seq[ExpressionTransformer]): ExpressionTransformer = { + if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(exprName, i)) { + throw new GlutenNotSupportException( + s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.") + } + GenericExpressionTransformer(exprName, childTransformers, i) + } + + i.functionName match { + case "encode" | "decode" if i.objectName.endsWith("UrlCodec") => + validateAndTransform( + "url_" + i.functionName, + Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap)) + ) + + case "isLuhnNumber" => + validateAndTransform( + ExpressionNames.LUHN_CHECK, + Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap)) + ) + + case "encode" | "decode" if i.objectName.endsWith("Base64") => + if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(ExpressionNames.BASE64, i)) { + throw new GlutenNotSupportException( + s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.") + } 
+ BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer( + ExpressionNames.BASE64, + replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap), + i + ) + + case fn + if i.objectName.endsWith("CharVarcharCodegenUtils") && Set( + "varcharTypeWriteSideCheck", + "charTypeWriteSideCheck", + "readSidePadding").contains(fn) => + val exprName = fn match { + case "varcharTypeWriteSideCheck" => ExpressionNames.VARCHAR_TYPE_WRITE_SIDE_CHECK + case "charTypeWriteSideCheck" => ExpressionNames.CHAR_TYPE_WRITE_SIDE_CHECK + case "readSidePadding" => ExpressionNames.READ_SIDE_PADDING + } + validateAndTransform( + exprName, + i.arguments.map(replaceWithExpressionTransformer0(_, attributeSeq, expressionsMap)) + ) + + case _ => + throw new GlutenNotSupportException( + s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " + + s"function: ${i.functionName}") + } + } + private def replaceIcebergStaticInvoke( s: StaticInvoke, attributeSeq: Seq[Attribute], @@ -186,33 +246,12 @@ object ExpressionConverter extends SQLConfHelper with Logging { return BackendsApiManager.getSparkPlanExecApiInstance.genHiveUDFTransformer( expr, attributeSeq) - case i: StaticInvoke - if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith( - "UrlCodec") => - return GenericExpressionTransformer( - "url_" + i.functionName, - replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap), - i) - case i: StaticInvoke if i.functionName.equals("isLuhnNumber") => - return GenericExpressionTransformer( - ExpressionNames.LUHN_CHECK, - replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap), - i) - case i: StaticInvoke - if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith("Base64") => - return BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer( - ExpressionNames.BASE64, - replaceWithExpressionTransformer0(i.arguments.head, 
attributeSeq, expressionsMap), - i - ) case i: StaticInvoke if i.functionName == "invoke" && i.staticObject.getName.startsWith( "org.apache.iceberg.spark.functions.") => return replaceIcebergStaticInvoke(i, attributeSeq, expressionsMap) case i: StaticInvoke => - throw new GlutenNotSupportException( - s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " + - s"function: ${i.functionName}") + return replaceStaticInvokeWithExpressionTransformer(i, attributeSeq, expressionsMap) case _ => } From 1430dc713fbf672e81c360aba4513058c2bc7f4c Mon Sep 17 00:00:00 2001 From: iiFeung Date: Wed, 22 Oct 2025 11:44:45 +0800 Subject: [PATCH 5/6] Exclude gluten tests for ClickHouse backend --- .../clickhouse/ClickHouseTestSettings.scala | 4 ++++ .../clickhouse/ClickHouseTestSettings.scala | 2 ++ .../clickhouse/ClickHouseTestSettings.scala | 16 ++++++++++++++++ .../clickhouse/ClickHouseTestSettings.scala | 16 ++++++++++++++++ 4 files changed, 38 insertions(+) diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 7f9bdba52b23..ba9287873594 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -135,6 +135,8 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenDSV2CharVarcharTestSuite] // failed on spark32 UT, see https://github.com/oap-project/gluten/issues/4043 .exclude("SPARK-34833: right-padding applied correctly for correlated subqueries - other preds") + .excludeGlutenTest("length check for input string values: nested in struct") + .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2SQLInsertTestSuite] enableSuite[GlutenDataFrameAggregateSuite] 
.exclude("average") @@ -354,6 +356,8 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("varchar type values length check and trim: partitioned columns") .exclude("char/varchar type values length check: partitioned columns of other types") .exclude("char type comparison: partitioned columns") + .excludeGlutenTest("length check for input string values: nested in struct") + .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenFileSourceSQLInsertTestSuite] .exclude("SPARK-33474: Support typed literals as partition spec values") .exclude( diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 6ce7d1e325a2..bba6b30fbe0f 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -152,6 +152,7 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenCountMinSketchAggQuerySuite] enableSuite[GlutenCsvFunctionsSuite] enableSuite[GlutenDSV2CharVarcharTestSuite] + .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2SQLInsertTestSuite] enableSuite[GlutenDataFrameAggregateSuite] .exclude("average") @@ -367,6 +368,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("varchar type values length check and trim: partitioned columns") .exclude("char/varchar type values length check: partitioned columns of other types") .exclude("char type comparison: partitioned columns") + .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenFileSourceSQLInsertTestSuite] .exclude("SPARK-33474: Support typed literals as partition spec values") .exclude( diff --git 
a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 21cf94a61e7c..9b36cdd6da31 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -154,6 +154,14 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenCountMinSketchAggQuerySuite] enableSuite[GlutenCsvFunctionsSuite] enableSuite[GlutenDSV2CharVarcharTestSuite] + .excludeGlutenTest("length check for input string values: top-level columns") + .excludeGlutenTest("length check for input string values: nested in array") + .excludeGlutenTest("length check for input string values: nested in struct of array") + .excludeGlutenTest("length check for input string values: nested in array of struct") + .excludeGlutenTest("length check for input string values: nested in array of array") + .excludeGlutenTest("length check for input string values: with implicit cast") + .excludeGlutenTest("char/varchar type values length check: partitioned columns of other types") + .excludeGlutenTest("SPARK-42611: check char/varchar length in reordered structs within arrays") enableSuite[GlutenDSV2SQLInsertTestSuite] enableSuite[GlutenDataFrameAggregateSuite] .exclude("average") @@ -367,6 +375,14 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("varchar type values length check and trim: partitioned columns") .exclude("char/varchar type values length check: partitioned columns of other types") .exclude("char type comparison: partitioned columns") + .excludeGlutenTest("length check for input string values: top-level columns") + .excludeGlutenTest("length check for input string values: partitioned columns") + .excludeGlutenTest("length check for input string values: nested in struct of array") 
+ .excludeGlutenTest("length check for input string values: with implicit cast") + .excludeGlutenTest("char/varchar type values length check: partitioned columns of other types") + .excludeGlutenTest("length check for input string values: nested in array of array") + .excludeGlutenTest("length check for input string values: nested in array of struct") + .excludeGlutenTest("length check for input string values: nested in array") enableSuite[GlutenFileSourceSQLInsertTestSuite] .exclude("SPARK-33474: Support typed literals as partition spec values") .exclude( diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index e7e3ddf8a034..03dce4b35d52 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -445,6 +445,14 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenCustomerExtensionSuite] enableSuite[GlutenDDLSourceLoadSuite] enableSuite[GlutenDSV2CharVarcharTestSuite] + .excludeGlutenTest("length check for input string values: top-level columns") + .excludeGlutenTest("length check for input string values: nested in array") + .excludeGlutenTest("length check for input string values: nested in struct of array") + .excludeGlutenTest("length check for input string values: nested in array of struct") + .excludeGlutenTest("length check for input string values: nested in array of array") + .excludeGlutenTest("length check for input string values: with implicit cast") + .excludeGlutenTest("char/varchar type values length check: partitioned columns of other types") + .excludeGlutenTest("SPARK-42611: check char/varchar length in reordered structs within arrays") enableSuite[GlutenDSV2SQLInsertTestSuite] 
enableSuite[GlutenDataFrameAggregateSuite] // Test for vanilla spark codegen, not apply for Gluten @@ -848,6 +856,14 @@ class ClickHouseTestSettings extends BackendTestSettings { .includeCH("length check for input string values: nested in both map key and value") .includeCH("length check for input string values: nested in array of struct") .includeCH("length check for input string values: nested in array of array") + .excludeGlutenTest("length check for input string values: top-level columns") + .excludeGlutenTest("length check for input string values: partitioned columns") + .excludeGlutenTest("length check for input string values: nested in struct of array") + .excludeGlutenTest("length check for input string values: with implicit cast") + .excludeGlutenTest("char/varchar type values length check: partitioned columns of other types") + .excludeGlutenTest("length check for input string values: nested in array") + .excludeGlutenTest("length check for input string values: nested in array of struct") + .excludeGlutenTest("length check for input string values: nested in array of array") enableSuite[GlutenFileSourceCustomMetadataStructSuite] enableSuite[GlutenFileSourceSQLInsertTestSuite] .excludeCH("SPARK-33474: Support typed literals as partition spec values") From df011d58f5a46de762e2ed5e8e2b8ee78ca4b3e0 Mon Sep 17 00:00:00 2001 From: iiFeung Date: Thu, 23 Oct 2025 16:53:55 +0800 Subject: [PATCH 6/6] Update ClickHouse and Velox test settings comments. 
--- .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 6 ++++++ .../org/apache/gluten/utils/velox/VeloxTestSettings.scala | 6 ++++++ .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 6 ++++++ .../org/apache/gluten/utils/velox/VeloxTestSettings.scala | 6 ++++++ .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 6 ++++++ .../org/apache/gluten/utils/velox/VeloxTestSettings.scala | 6 ++++++ .../gluten/utils/clickhouse/ClickHouseTestSettings.scala | 6 ++++++ .../org/apache/gluten/utils/velox/VeloxTestSettings.scala | 6 ++++++ 8 files changed, 48 insertions(+) diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index ba9287873594..dc44facd7ef9 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -135,6 +135,9 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenDSV2CharVarcharTestSuite] // failed on spark32 UT, see https://github.com/oap-project/gluten/issues/4043 .exclude("SPARK-34833: right-padding applied correctly for correlated subqueries - other preds") + // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. 
.excludeGlutenTest("length check for input string values: nested in struct") .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2SQLInsertTestSuite] @@ -356,6 +359,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("varchar type values length check and trim: partitioned columns") .exclude("char/varchar type values length check: partitioned columns of other types") .exclude("char type comparison: partitioned columns") + // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. .excludeGlutenTest("length check for input string values: nested in struct") .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenFileSourceSQLInsertTestSuite] diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 847fde62da7e..8929dfbefab0 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -422,9 +422,15 @@ class VeloxTestSettings extends BackendTestSettings { .excludeByPrefix("SPARK-24705") .excludeByPrefix("determining the number of reducers") enableSuite[GlutenFileSourceCharVarcharTestSuite] + // Following tests are excluded as these are overridden in Gluten test suite. + // The overridden tests assert against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests.
.exclude("length check for input string values: nested in struct") .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2CharVarcharTestSuite] + // Following tests are excluded as these are overridden in Gluten test suite. + // The overridden tests assert against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests. .exclude("length check for input string values: nested in struct") .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenFileScanSuite] diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index bba6b30fbe0f..2fcd692598c1 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -152,6 +152,9 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenCountMinSketchAggQuerySuite] enableSuite[GlutenCsvFunctionsSuite] enableSuite[GlutenDSV2CharVarcharTestSuite] + // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2SQLInsertTestSuite] enableSuite[GlutenDataFrameAggregateSuite] @@ -368,6 +371,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("varchar type values length check and trim: partitioned columns") .exclude("char/varchar type values length check: partitioned columns of other types") .exclude("char type comparison: partitioned columns") + // Excluded.
The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. .excludeGlutenTest("length check for input string values: nested in struct of array") enableSuite[GlutenFileSourceSQLInsertTestSuite] .exclude("SPARK-33474: Support typed literals as partition spec values") diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index c0ff132ae834..3249017c3b18 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -616,8 +616,14 @@ class VeloxTestSettings extends BackendTestSettings { // Extra ColumnarToRow is needed to transform vanilla columnar data to gluten columnar data. .exclude("SPARK-37369: Avoid redundant ColumnarToRow transition on InMemoryTableScan") enableSuite[GlutenFileSourceCharVarcharTestSuite] + // Following test is excluded as it is overridden in Gluten test suite. + // The overridden test asserts against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests. .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenDSV2CharVarcharTestSuite] + // Following test is excluded as it is overridden in Gluten test suite. + // The overridden test asserts against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests. .exclude("length check for input string values: nested in struct of array") enableSuite[GlutenColumnExpressionSuite] // Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'.
diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 9b36cdd6da31..71b621081746 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -154,6 +154,9 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenCountMinSketchAggQuerySuite] enableSuite[GlutenCsvFunctionsSuite] enableSuite[GlutenDSV2CharVarcharTestSuite] + // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. .excludeGlutenTest("length check for input string values: top-level columns") .excludeGlutenTest("length check for input string values: nested in array") .excludeGlutenTest("length check for input string values: nested in struct of array") @@ -375,6 +378,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("varchar type values length check and trim: partitioned columns") .exclude("char/varchar type values length check: partitioned columns of other types") .exclude("char type comparison: partitioned columns") + // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. 
.excludeGlutenTest("length check for input string values: top-level columns") .excludeGlutenTest("length check for input string values: partitioned columns") .excludeGlutenTest("length check for input string values: nested in struct of array") diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 831ea1a2e91b..2fa7c7a77e43 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -654,6 +654,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("length check for input string values: nested in both map key and value") .exclude("length check for input string values: nested in array of struct") .exclude("length check for input string values: nested in array of array") + // Following tests are excluded as these are overridden in Gluten test suite. + // The overridden tests assert against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests. .exclude("length check for input string values: top-level columns") .exclude("length check for input string values: partitioned columns") .exclude("length check for input string values: nested in struct of array") @@ -661,6 +664,9 @@ .exclude("char/varchar type values length check: partitioned columns of other types") enableSuite[GlutenDSV2CharVarcharTestSuite] + // Following tests are excluded as these are overridden in Gluten test suite. + // The overridden tests assert against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests.
.exclude("SPARK-42611: check char/varchar length in reordered structs within arrays") .exclude("char/varchar type values length check: partitioned columns of other types") .exclude("length check for input string values: top-level columns") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 03dce4b35d52..d325d8a6b9c0 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -445,6 +445,9 @@ class ClickHouseTestSettings extends BackendTestSettings { enableSuite[GlutenCustomerExtensionSuite] enableSuite[GlutenDDLSourceLoadSuite] enableSuite[GlutenDSV2CharVarcharTestSuite] + // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. .excludeGlutenTest("length check for input string values: top-level columns") .excludeGlutenTest("length check for input string values: nested in array") .excludeGlutenTest("length check for input string values: nested in struct of array") @@ -856,6 +859,9 @@ class ClickHouseTestSettings extends BackendTestSettings { .includeCH("length check for input string values: nested in both map key and value") .includeCH("length check for input string values: nested in array of struct") .includeCH("length check for input string values: nested in array of array") + // Excluded. The Gluten tests for char/varchar validation were rewritten for Velox. + // ClickHouse backend doesn't support this feature and falls back to vanilla Spark, + // causing mismatches in error messages. 
.excludeGlutenTest("length check for input string values: top-level columns") .excludeGlutenTest("length check for input string values: partitioned columns") .excludeGlutenTest("length check for input string values: nested in struct of array") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index da4db937cc0b..66a0583450bf 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -676,12 +676,18 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("length check for input string values: nested in both map key and value") .exclude("length check for input string values: nested in array of struct") .exclude("length check for input string values: nested in array of array") + // Following tests are excluded as these are overridden in Gluten test suite. + // The overridden tests assert against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests. .exclude("length check for input string values: top-level columns") .exclude("length check for input string values: partitioned columns") .exclude("length check for input string values: nested in struct of array") .exclude("length check for input string values: with implicit cast") .exclude("char/varchar type values length check: partitioned columns of other types") enableSuite[GlutenDSV2CharVarcharTestSuite] + // Following tests are excluded as these are overridden in Gluten test suite. + // The overridden tests assert against Velox-specific error messages for char/varchar + // length validation, which differ from the original vanilla Spark tests.
.exclude("length check for input string values: top-level columns") .exclude("length check for input string values: nested in array") .exclude("length check for input string values: nested in struct of array")