-
Notifications
You must be signed in to change notification settings - Fork 582
[GLUTEN-9106][VL] Add support for staticInvoke CharVarcharCodegenUtils. #9107
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b79b83e
104cb44
4abd4b5
a0fbecb
1430dc7
df011d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -622,6 +622,47 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { | |
| } | ||
| } | ||
|
|
||
| // Add test suite for CharVarcharCodegenUtils functions. | ||
| // A ProjectExecTransformer is expected to be constructed after expr support. | ||
| // We currently test below functions with Spark v3.4 | ||
| testWithMinSparkVersion("charTypeWriteSideCheck", "3.4") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why only test with Spark 3.4? Does Spark 3.3 not have this function?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, thanks for pointing out. From my observation, it turns out that although Spark 3.3 does have this function, the execution plan is different, not triggering |
||
| withTable("src", "dest") { | ||
|
|
||
| sql("create table src(id string) USING PARQUET") | ||
| sql("insert into src values('s')") | ||
| sql("create table dest(id char(3)) USING PARQUET") | ||
| // check whether the executed plan of a dataframe contains the expected plan. | ||
| runQueryAndCompare("insert into dest select id from src") { | ||
| checkGlutenOperatorMatch[ProjectExecTransformer] | ||
| } | ||
| } | ||
| } | ||
|
|
||
| testWithMinSparkVersion("varcharTypeWriteSideCheck", "3.4") { | ||
| withTable("src", "dest") { | ||
|
|
||
| sql("create table src(id string) USING PARQUET") | ||
| sql("insert into src values('abc')") | ||
| sql("create table dest(id varchar(10)) USING PARQUET") | ||
| // check whether the executed plan of a dataframe contains the expected plan. | ||
| runQueryAndCompare("insert into dest select id from src") { | ||
| checkGlutenOperatorMatch[ProjectExecTransformer] | ||
| } | ||
| } | ||
| } | ||
|
|
||
| testWithMinSparkVersion("readSidePadding", "3.4") { | ||
| withTable("src", "dest") { | ||
|
|
||
| sql("create table tgt(id char(3)) USING PARQUET") | ||
| sql("insert into tgt values('p')") | ||
| // check whether the executed plan of a dataframe contains the expected plan. | ||
| runQueryAndCompare("select id from tgt") { | ||
| checkGlutenOperatorMatch[ProjectExecTransformer] | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("soundex") { | ||
| runQueryAndCompare("select soundex(c_comment) from customer limit 50") { | ||
| checkGlutenOperatorMatch[ProjectExecTransformer] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -144,6 +144,66 @@ object ExpressionConverter extends SQLConfHelper with Logging { | |
| DecimalArithmeticExpressionTransformer(substraitName, leftChild, rightChild, resultType, b) | ||
| } | ||
|
|
||
| private def replaceStaticInvokeWithExpressionTransformer( | ||
| i: StaticInvoke, | ||
| attributeSeq: Seq[Attribute], | ||
| expressionsMap: Map[Class[_], String]): ExpressionTransformer = { | ||
| def validateAndTransform( | ||
| exprName: String, | ||
| childTransformers: => Seq[ExpressionTransformer]): ExpressionTransformer = { | ||
| if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(exprName, i)) { | ||
| throw new GlutenNotSupportException( | ||
| s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.") | ||
| } | ||
| GenericExpressionTransformer(exprName, childTransformers, i) | ||
| } | ||
|
|
||
| i.functionName match { | ||
| case "encode" | "decode" if i.objectName.endsWith("UrlCodec") => | ||
| validateAndTransform( | ||
| "url_" + i.functionName, | ||
| Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap)) | ||
| ) | ||
|
|
||
| case "isLuhnNumber" => | ||
| validateAndTransform( | ||
| ExpressionNames.LUHN_CHECK, | ||
| Seq(replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap)) | ||
| ) | ||
|
|
||
| case "encode" | "decode" if i.objectName.endsWith("Base64") => | ||
| if (!BackendsApiManager.getValidatorApiInstance.doExprValidate(ExpressionNames.BASE64, i)) { | ||
| throw new GlutenNotSupportException( | ||
| s"Not supported to map current ${i.getClass} call on function: ${i.functionName}.") | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the refactor. These validations seem to be newly introduced; would you please clarify a bit?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi rui-mo, thanks for mentioning this. Apologize for such a long context within this PR.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, I understand. |
||
| BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer( | ||
| ExpressionNames.BASE64, | ||
| replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap), | ||
| i | ||
| ) | ||
|
|
||
| case fn | ||
| if i.objectName.endsWith("CharVarcharCodegenUtils") && Set( | ||
| "varcharTypeWriteSideCheck", | ||
| "charTypeWriteSideCheck", | ||
| "readSidePadding").contains(fn) => | ||
| val exprName = fn match { | ||
| case "varcharTypeWriteSideCheck" => ExpressionNames.VARCHAR_TYPE_WRITE_SIDE_CHECK | ||
| case "charTypeWriteSideCheck" => ExpressionNames.CHAR_TYPE_WRITE_SIDE_CHECK | ||
| case "readSidePadding" => ExpressionNames.READ_SIDE_PADDING | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The mappings from Spark's name to Velox's name are more commonly put at https://github.com/apache/incubator-gluten/blob/main/cpp/velox/substrait/SubstraitParser.cc#L387 to allow compatibility among different backends.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rui-mo Hi rui-mo, could you help give me some guidance on how I should update the codes?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To add some context, so I guess substrait function name (from
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typically, in Gluten’s ExpressionNames.scala, the Spark function names are defined, while in the C++ map, the corresponding function names for a specific backend are provided. For instance, the Spark function add might be represented as plus in certain backends. To maintain consistency, Gluten’s Scala side keeps Spark’s original naming (e.g., add), and it is the responsibility of each backend to map it to the appropriate equivalent name. This is just a minor suggestion—please feel free to decide whether you’d like to make this change.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi rui-mo! Thanks for clarifying the reasoning behind this design. Maybe I'll keep the current state for this PR, but I'll be sure to follow this mapping mechanism in subsequent PR. Really appreciate the guidance! |
||
| } | ||
| validateAndTransform( | ||
| exprName, | ||
| i.arguments.map(replaceWithExpressionTransformer0(_, attributeSeq, expressionsMap)) | ||
| ) | ||
|
|
||
| case _ => | ||
| throw new GlutenNotSupportException( | ||
| s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " + | ||
| s"function: ${i.functionName}") | ||
| } | ||
| } | ||
|
|
||
| private def replaceIcebergStaticInvoke( | ||
| s: StaticInvoke, | ||
| attributeSeq: Seq[Attribute], | ||
|
|
@@ -186,33 +246,12 @@ object ExpressionConverter extends SQLConfHelper with Logging { | |
| return BackendsApiManager.getSparkPlanExecApiInstance.genHiveUDFTransformer( | ||
| expr, | ||
| attributeSeq) | ||
| case i: StaticInvoke | ||
| if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith( | ||
| "UrlCodec") => | ||
| return GenericExpressionTransformer( | ||
| "url_" + i.functionName, | ||
| replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap), | ||
| i) | ||
| case i: StaticInvoke if i.functionName.equals("isLuhnNumber") => | ||
| return GenericExpressionTransformer( | ||
| ExpressionNames.LUHN_CHECK, | ||
| replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap), | ||
| i) | ||
| case i: StaticInvoke | ||
| if Seq("encode", "decode").contains(i.functionName) && i.objectName.endsWith("Base64") => | ||
| return BackendsApiManager.getSparkPlanExecApiInstance.genBase64StaticInvokeTransformer( | ||
| ExpressionNames.BASE64, | ||
| replaceWithExpressionTransformer0(i.arguments.head, attributeSeq, expressionsMap), | ||
| i | ||
| ) | ||
| case i: StaticInvoke | ||
| if i.functionName == "invoke" && i.staticObject.getName.startsWith( | ||
| "org.apache.iceberg.spark.functions.") => | ||
| return replaceIcebergStaticInvoke(i, attributeSeq, expressionsMap) | ||
| case i: StaticInvoke => | ||
| throw new GlutenNotSupportException( | ||
| s"Not supported to transform StaticInvoke with object: ${i.staticObject.getName}, " + | ||
| s"function: ${i.functionName}") | ||
| return replaceStaticInvokeWithExpressionTransformer(i, attributeSeq, expressionsMap) | ||
| case _ => | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
DefaultValidator always returns false in its doValidate method, which means these CharVarcharCodegenUtils functions will never be validated as supported in the ClickHouse backend. This will cause these functions to fall back instead of being properly handled.