Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,9 @@ object CHExpressionUtil {
MAKE_DATE -> DefaultValidator(),
ARRAY_APPEND -> DefaultValidator(),
JSON_OBJECT_KEYS -> DefaultValidator(),
LUHN_CHECK -> DefaultValidator()
LUHN_CHECK -> DefaultValidator(),
VARCHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(),
CHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(),
READ_SIDE_PADDING -> DefaultValidator()
Comment on lines +211 to +213
Copy link

Copilot AI Aug 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DefaultValidator always returns false in its doValidate method, which means these CharVarcharCodegenUtils functions will never be validated as supported in the ClickHouse backend. This will cause these functions to fall back instead of being properly handled.

Suggested change
VARCHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(),
CHAR_TYPE_WRITE_SIDE_CHECK -> DefaultValidator(),
READ_SIDE_PADDING -> DefaultValidator()
VARCHAR_TYPE_WRITE_SIDE_CHECK -> CharVarcharCodegenValidator(),
CHAR_TYPE_WRITE_SIDE_CHECK -> CharVarcharCodegenValidator(),
READ_SIDE_PADDING -> CharVarcharCodegenValidator()

Copilot uses AI. Check for mistakes.
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,47 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite {
}
}

// Add test suite for CharVarcharCodegenUtils functions.
// A ProjectExecTransformer is expected to be constructed after expr support.
// We currently test below functions with Spark v3.4
testWithMinSparkVersion("charTypeWriteSideCheck", "3.4") {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why only test with Spark 3.4? Does Spark 3.3 not have this function?

Copy link
Contributor Author

@Yifeng-Wang Yifeng-Wang Oct 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, thanks for pointing that out. From my observation, it turns out that although Spark 3.3 does have this function, the execution plan is different and does not trigger StaticInvoke, so the test uses min = Spark 3.4.

withTable("src", "dest") {

sql("create table src(id string) USING PARQUET")
sql("insert into src values('s')")
sql("create table dest(id char(3)) USING PARQUET")
// check whether the executed plan of a dataframe contains the expected plan.
runQueryAndCompare("insert into dest select id from src") {
checkGlutenOperatorMatch[ProjectExecTransformer]
}
}
}

// Verify that varcharTypeWriteSideCheck is offloaded: an insert into a
// varchar(10) column should still be planned with a Gluten project operator.
testWithMinSparkVersion("varcharTypeWriteSideCheck", "3.4") {
  withTable("src", "dest") {
    sql("create table src(id string) USING PARQUET")
    sql("insert into src values('abc')")
    sql("create table dest(id varchar(10)) USING PARQUET")
    // The write-side length check must not force a fallback.
    runQueryAndCompare("insert into dest select id from src")(
      checkGlutenOperatorMatch[ProjectExecTransformer])
  }
}

// Verify that readSidePadding is offloaded: selecting from a char(3) column
// should still be planned with a Gluten project operator.
testWithMinSparkVersion("readSidePadding", "3.4") {
  // Bug fix: the table created and queried here is `tgt`, but the original
  // registered "src"/"dest" (which are never created) with withTable. That
  // left `tgt` behind after the test, breaking repeated runs in one session.
  withTable("tgt") {
    sql("create table tgt(id char(3)) USING PARQUET")
    sql("insert into tgt values('p')")
    // check whether the executed plan of a dataframe contains the expected plan.
    runQueryAndCompare("select id from tgt") {
      checkGlutenOperatorMatch[ProjectExecTransformer]
    }
  }
}

test("soundex") {
runQueryAndCompare("select soundex(c_comment) from customer limit 50") {
checkGlutenOperatorMatch[ProjectExecTransformer]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,66 @@ object ExpressionConverter extends SQLConfHelper with Logging {
DecimalArithmeticExpressionTransformer(substraitName, leftChild, rightChild, resultType, b)
}

private def replaceStaticInvokeWithExpressionTransformer(
i: StaticInvoke,
attributeSeq: Seq[Attribute],
expressionsMap: Map[Class[_], String]): ExpressionTransformer = {
def validateAndTransform(
exprName: String,
childTransformers: => Seq[ExpressionTransformer]): ExpressionTransformer = {
if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(exprName, i)) {
throw new GlutenNotSupportException(
s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.")
}
GenericExpressionTransformer(exprName, childTransformers, i)
}

i.functionName match {
case "encode" | "decode" if i.objectName.endsWith("UrlCodec") =>
validateAndTransform(
"url_" + i.functionName,
Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap))
)

case "isLuhnNumber" =>
validateAndTransform(
ExpressionNames.LUHN_CHECK,
Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap))
)

case "encode" | "decode" if i.objectName.endsWith("Base64") =>
if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(ExpressionNames.BASE64, i)) {
throw new GlutenNotSupportException(
s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.")
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the refactor. This validations seem to be newly introduced, would you please clarify a bit?

Copy link
Contributor Author

@Yifeng-Wang Yifeng-Wang Oct 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi rui-mo, thanks for mentioning this. Apologize for such a long context within this PR.
This doExprValidate call on validator instance was added deliberately to ensure backend actually supports.
Technically this was from my earlier findings (if you scroll up a bit, on Mar 26) , and currently it ensures that:

  • An explicit call on doExprValidate.
  • CH_BLACKLIST_SCALAR_FUNCTION is looked through during e.g., CHValidatorApi::doExprValidate so that the functions which are ONLY supported at Velox side, won't cause "Unknown function parser" for clickhouse BE(in CI tests).
  • Intercept at Gluten with GlutenNotSupportException, instead of sending to actual unsupported (ClickHouse) backend.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I understand.

BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer(
ExpressionNames.BASE64,
replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
i
)

case fn
if i.objectName.endsWith("CharVarcharCodegenUtils") && Set(
"varcharTypeWriteSideCheck",
"charTypeWriteSideCheck",
"readSidePadding").contains(fn) =>
val exprName = fn match {
case "varcharTypeWriteSideCheck" => ExpressionNames.VARCHAR_TYPE_WRITE_SIDE_CHECK
case "charTypeWriteSideCheck" => ExpressionNames.CHAR_TYPE_WRITE_SIDE_CHECK
case "readSidePadding" => ExpressionNames.READ_SIDE_PADDING
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The mappings from Spark's name to Velox's name are more commonly to be put at https://github.com/apache/incubator-gluten/blob/main/cpp/velox/substrait/SubstraitParser.cc#L387 to allow compatibility among different backends.

Copy link
Contributor Author

@Yifeng-Wang Yifeng-Wang Oct 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rui-mo Hi rui-mo, could you help give me some guidance on how I should update the codes?
Currently the gluten-velox run of my expression conversion does not seem to rely on this map?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To add some context,
at Gluten side in ExpressionNames.scala. I had e.g.:
final val VARCHAR_TYPE_WRITE_SIDE_CHECK = "varchar_type_write_side_check" .
And at velox side, there is

registerFunction<
VarcharTypeWriteSideCheckFunction,
Varchar,
Varchar,
int32_t>
({prefix + "varchar_type_write_side_check"});

so I guess substrait function name (from ExpressionNames.VARCHAR_TYPE_WRITE_SIDE_CHECK = "varchar_type_write_side_check") already matches the Velox registered function name (prefix + "varchar_type_write_side_check").

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typically, in Gluten’s ExpressionNames.scala, the Spark function names are defined, while in the C++ map, the corresponding function names for a specific backend are provided. For instance, the Spark function add might be represented as plus in certain backends. To maintain consistency, Gluten’s Scala side keeps Spark’s original naming (e.g., add), and it is the responsibility of each backend to map it to the appropriate equivalent name. This is just a minor suggestion—please feel free to decide whether you’d like to make this change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi rui-mo! ​Thanks for clarifying the reasoning behind this design. Maybe I'll keep the current state for this PR, but I'll be sure to follow this mapping mechanism in subsequent PR. Really appreciate the guidance!

}
validateAndTransform(
exprName,
i.arguments.map(replaceWithExpressionTransformer0(_, attributeSeq, expressionsMap))
)

case _ =>
throw new GlutenNotSupportException(
s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " +
s"function: ${i.functionName}")
}
}

private def replaceIcebergStaticInvoke(
s: StaticInvoke,
attributeSeq: Seq[Attribute],
Expand Down Expand Up @@ -186,33 +246,12 @@ object ExpressionConverter extends SQLConfHelper with Logging {
return BackendsApiManager.getSparkPlanExecApiInstance.genHiveUDFTransformer(
expr,
attributeSeq)
case i: StaticInvoke
if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith(
"UrlCodec") =>
return GenericExpressionTransformer(
"url_" + i.functionName,
replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
i)
case i: StaticInvoke if i.functionName.equals("isLuhnNumber") =>
return GenericExpressionTransformer(
ExpressionNames.LUHN_CHECK,
replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
i)
case i: StaticInvoke
if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith("Base64") =>
return BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer(
ExpressionNames.BASE64,
replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap),
i
)
case i: StaticInvoke
if i.functionName == "invoke" && i.staticObject.getName.startsWith(
"org.apache.iceberg.spark.functions.") =>
return replaceIcebergStaticInvoke(i, attributeSeq, expressionsMap)
case i: StaticInvoke =>
throw new GlutenNotSupportException(
s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " +
s"function: ${i.functionName}")
return replaceStaticInvokeWithExpressionTransformer(i, attributeSeq, expressionsMap)
case _ =>
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ class ClickHouseTestSettings extends BackendTestSettings {
enableSuite[GlutenDSV2CharVarcharTestSuite]
// failed on spark32 UT, see https://github.com/oap-project/gluten/issues/4043
.exclude("SPARK-34833: right-padding applied correctly for correlated subqueries - other preds")
// Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
// ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
// causing mismatches in error messages.
.excludeGlutenTest("length check for input string values: nested in struct")
.excludeGlutenTest("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2SQLInsertTestSuite]
enableSuite[GlutenDataFrameAggregateSuite]
.exclude("average")
Expand Down Expand Up @@ -354,6 +359,11 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("varchar type values length check and trim: partitioned columns")
.exclude("char/varchar type values length check: partitioned columns of other types")
.exclude("char type comparison: partitioned columns")
// Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
// ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
// causing mismatches in error messages.
.excludeGlutenTest("length check for input string values: nested in struct")
.excludeGlutenTest("length check for input string values: nested in struct of array")
enableSuite[GlutenFileSourceSQLInsertTestSuite]
.exclude("SPARK-33474: Support typed literals as partition spec values")
.exclude(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,17 @@ class VeloxTestSettings extends BackendTestSettings {
.excludeByPrefix("SPARK-24705")
.excludeByPrefix("determining the number of reducers")
enableSuite[GlutenFileSourceCharVarcharTestSuite]
// Following tests are excluded as these are overridden in the Gluten test suite.
// The overridden tests assert against Velox-specific error messages for char/varchar
// length validation, which differ from the original vanilla Spark tests.
.exclude("length check for input string values: nested in struct")
.exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2CharVarcharTestSuite]
// Following tests are excluded as these are overridden in the Gluten test suite.
// The overridden tests assert against Velox-specific error messages for char/varchar
// length validation, which differ from the original vanilla Spark tests.
.exclude("length check for input string values: nested in struct")
.exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenFileScanSuite]
enableSuite[GlutenNestedDataSourceV1Suite]
enableSuite[GlutenNestedDataSourceV2Suite]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,73 @@
*/
package org.apache.spark.sql

import org.apache.spark.SparkException

class GlutenFileSourceCharVarcharTestSuite
extends FileSourceCharVarcharTestSuite
with GlutenSQLTestsTrait {}
with GlutenSQLTestsTrait {

// Runs the given write scenario once for `char` and once for `varchar`,
// dropping table `t` after each run.
private def testTableWrite(f: String => Unit): Unit =
  Seq("char", "varchar").foreach(typeName => withTable("t")(f(typeName)))

private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"

// Overrides the vanilla Spark test: Velox reports its own error message for
// char/varchar length violations on a string nested inside a struct.
testGluten("length check for input string values: nested in struct") {
  testTableWrite { typeName =>
    sql(s"CREATE TABLE t(c STRUCT<c: $typeName(5)>) USING $format")
    // A null element is always accepted.
    sql("INSERT INTO t SELECT struct(null)")
    checkAnswer(spark.table("t"), Row(Row(null)))
    // A 6-character value must be rejected by the write-side length check.
    val e = intercept[SparkException](sql("INSERT INTO t SELECT struct('123456')"))
    assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
  }
}

// Overrides the vanilla Spark test: Velox reports its own error message for
// char/varchar length violations on an array element nested inside a struct.
testGluten("length check for input string values: nested in struct of array") {
  testTableWrite { typeName =>
    sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
    // A null element is always accepted.
    sql("INSERT INTO t SELECT struct(array(null))")
    checkAnswer(spark.table("t"), Row(Row(Seq(null))))
    // A 6-character element must be rejected by the write-side length check.
    val e = intercept[SparkException] {
      sql("INSERT INTO t SELECT struct(array('123456'))")
    }
    assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
  }
}
}

class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {
private def testTableWrite(f: String => Unit): Unit = {
withTable("t")(f("char"))
withTable("t")(f("varchar"))
}

private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"

// DSV2 variant: asserts the Velox-specific error for an over-long value
// written into a char/varchar field nested in a struct.
testGluten("length check for input string values: nested in struct") {
  testTableWrite { typeName =>
    sql(s"CREATE TABLE t(c STRUCT<c: $typeName(5)>) USING $format")
    // Null is accepted regardless of the declared length.
    sql("INSERT INTO t SELECT struct(null)")
    checkAnswer(spark.table("t"), Row(Row(null)))
    // Six characters exceed the limit of 5 and must fail.
    val e = intercept[SparkException](sql("INSERT INTO t SELECT struct('123456')"))
    assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
  }
}

class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {}
// DSV2 variant: asserts the Velox-specific error for an over-long array
// element written into a char/varchar field nested in a struct.
testGluten("length check for input string values: nested in struct of array") {
  testTableWrite { typeName =>
    sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
    // Null elements are accepted regardless of the declared length.
    sql("INSERT INTO t SELECT struct(array(null))")
    checkAnswer(spark.table("t"), Row(Row(Seq(null))))
    // Six characters exceed the limit of 5 and must fail.
    val e = intercept[SparkException] {
      sql("INSERT INTO t SELECT struct(array('123456'))")
    }
    assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,10 @@ class ClickHouseTestSettings extends BackendTestSettings {
enableSuite[GlutenCountMinSketchAggQuerySuite]
enableSuite[GlutenCsvFunctionsSuite]
enableSuite[GlutenDSV2CharVarcharTestSuite]
// Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
// ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
// causing mismatches in error messages.
.excludeGlutenTest("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2SQLInsertTestSuite]
enableSuite[GlutenDataFrameAggregateSuite]
.exclude("average")
Expand Down Expand Up @@ -367,6 +371,10 @@ class ClickHouseTestSettings extends BackendTestSettings {
.exclude("varchar type values length check and trim: partitioned columns")
.exclude("char/varchar type values length check: partitioned columns of other types")
.exclude("char type comparison: partitioned columns")
// Excluded. The Gluten tests for char/varchar validation were rewritten for Velox.
// ClickHouse backend doesn't support this feature and falls back to vanilla Spark,
// causing mismatches in error messages.
.excludeGlutenTest("length check for input string values: nested in struct of array")
enableSuite[GlutenFileSourceSQLInsertTestSuite]
.exclude("SPARK-33474: Support typed literals as partition spec values")
.exclude(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,15 @@ class VeloxTestSettings extends BackendTestSettings {
// Extra ColumnarToRow is needed to transform vanilla columnar data to gluten columnar data.
.exclude("SPARK-37369: Avoid redundant ColumnarToRow transition on InMemoryTableScan")
enableSuite[GlutenFileSourceCharVarcharTestSuite]
// Following test is excluded as it is overridden in the Gluten test suite.
// The overridden tests assert against Velox-specific error messages for char/varchar
// length validation, which differ from the original vanilla Spark tests.
.exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenDSV2CharVarcharTestSuite]
// Following test is excluded as it is overridden in the Gluten test suite.
// The overridden tests assert against Velox-specific error messages for char/varchar
// length validation, which differ from the original vanilla Spark tests.
.exclude("length check for input string values: nested in struct of array")
enableSuite[GlutenColumnExpressionSuite]
// Velox raise_error('errMsg') throws a velox_user_error exception with the message 'errMsg'.
// The final caught Spark exception's getCause().getMessage() contains 'errMsg' but does not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,48 @@
*/
package org.apache.spark.sql

import org.apache.spark.SparkException

class GlutenFileSourceCharVarcharTestSuite
extends FileSourceCharVarcharTestSuite
with GlutenSQLTestsTrait {}
with GlutenSQLTestsTrait {

private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"

// Runs the given write scenario once per char-like type ("char", then
// "varchar"), dropping table `t` between runs via withTable.
private def testTableWrite(f: String => Unit): Unit = {
withTable("t")(f("char"))
withTable("t")(f("varchar"))
}

// Overrides the vanilla Spark test to match Velox's error message when an
// array element nested in a struct exceeds the declared char/varchar length.
testGluten("length check for input string values: nested in struct of array") {
  testTableWrite { typeName =>
    sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
    // Null is always a legal value.
    sql("INSERT INTO t SELECT struct(array(null))")
    checkAnswer(spark.table("t"), Row(Row(Seq(null))))
    // An over-long element must be rejected with the Velox error text.
    val e = intercept[SparkException] {
      sql("INSERT INTO t SELECT struct(array('123456'))")
    }
    assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
  }
}
}

class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {

private val VELOX_ERROR_MESSAGE = "Exceeds allowed length limitation: 5"

// Runs the given write scenario once for each char-like type, dropping
// table `t` after each run.
private def testTableWrite(f: String => Unit): Unit =
  for (typeName <- Seq("char", "varchar")) withTable("t")(f(typeName))

class GlutenDSV2CharVarcharTestSuite extends DSV2CharVarcharTestSuite with GlutenSQLTestsTrait {}
// DSV2 variant: matches Velox's error message when an array element nested
// in a struct exceeds the declared char/varchar length.
testGluten("length check for input string values: nested in struct of array") {
  testTableWrite { typeName =>
    sql(s"CREATE TABLE t(c STRUCT<c: ARRAY<$typeName(5)>>) USING $format")
    // Null is always a legal value.
    sql("INSERT INTO t SELECT struct(array(null))")
    checkAnswer(spark.table("t"), Row(Row(Seq(null))))
    // An over-long element must be rejected with the Velox error text.
    val e = intercept[SparkException] {
      sql("INSERT INTO t SELECT struct(array('123456'))")
    }
    assert(e.getCause.getMessage.contains(VELOX_ERROR_MESSAGE))
  }
}
}
Loading