From 127bad320fa46d3a58ee679c70ac507af2a6d877 Mon Sep 17 00:00:00 2001
From: noroshi <253434427+n0r0shi@users.noreply.github.com>
Date: Sat, 28 Feb 2026 10:09:25 +0000
Subject: [PATCH] [GLUTEN-7628][VL] Support monotonically_increasing_id function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `Sig[MonotonicallyIncreasingID]` to `ExpressionMappings.SCALAR_SIGS`
so the function is offloaded to Velox instead of falling back to vanilla
Spark.

Also sets Velox's `expression.dedup_non_deterministic` to `false`. By
default Velox deduplicates structurally identical non-deterministic
expression trees, merging them into a single instance with shared state.
This is incorrect for Spark semantics where each non-deterministic call
has independent state — e.g.
`SELECT monotonically_increasing_id(), monotonically_increasing_id()`
must return [0,0],[1,1] (two independent counters), not [0,2],[1,3] (one
shared counter). For seeded functions like `rand(42)`, disabling dedup is
safe: each independent instance produces the same sequence from the same
seed, matching Spark's behavior either way.

Un-ignores and fixes the corresponding test in
`ScalarFunctionsValidateSuite`.

Closes #7628 --- .../gluten/functions/ScalarFunctionsValidateSuite.scala | 9 +++++++-- cpp/velox/compute/WholeStageResultIterator.cc | 8 ++++++++ .../apache/gluten/expression/ExpressionMappings.scala | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala index 302b4aa603d3..4c877bc53d60 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala @@ -496,12 +496,17 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { } } - // FIXME: Ignored: https://github.com/apache/incubator-gluten/issues/7600. - ignore("monotonically_increasintestg_id") { + test("monotonically_increasing_id") { runQueryAndCompare("""SELECT monotonically_increasing_id(), l_orderkey | from lineitem limit 100""".stripMargin) { checkGlutenPlan[ProjectExecTransformer] } + // Multiple calls must produce independent results (issue #7628). + runQueryAndCompare( + """SELECT monotonically_increasing_id(), monotonically_increasing_id() + | from lineitem limit 100""".stripMargin) { + checkGlutenPlan[ProjectExecTransformer] + } } test("sequence function optimized by Spark constant folding") { diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc index babcaf0e5f64..3a1e9888da64 100644 --- a/cpp/velox/compute/WholeStageResultIterator.cc +++ b/cpp/velox/compute/WholeStageResultIterator.cc @@ -666,6 +666,14 @@ std::unordered_map WholeStageResultIterator::getQueryC configs[velox::core::QueryConfig::kSparkPartitionId] = std::to_string(taskInfo_.partitionId); + // Disable expression deduplication for non-deterministic functions to + // match Spark semantics. 
Spark creates separate instances for each + // non-deterministic expression, each with independent state. Without + // this, Velox merges structurally identical non-deterministic calls + // (e.g. two monotonically_increasing_id() in the same query) into one + // shared instance, causing incorrect results. + configs[velox::core::QueryConfig::kExprDedupNonDeterministic] = "false"; + // Enable Spark legacy date formatter if spark.sql.legacy.timeParserPolicy is set to 'LEGACY' // or 'legacy' if (veloxCfg_->get(kSparkLegacyTimeParserPolicy, "") == "LEGACY") { diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala index b13aced2a62c..b190419e4b27 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala @@ -290,6 +290,7 @@ object ExpressionMappings { Sig[MakeDecimal](MAKE_DECIMAL), Sig[PromotePrecision](PROMOTE_PRECISION), Sig[SparkPartitionID](SPARK_PARTITION_ID), + Sig[MonotonicallyIncreasingID](MONOTONICALLY_INCREASING_ID), Sig[AtLeastNNonNulls](AT_LEAST_N_NON_NULLS), Sig[WidthBucket](WIDTH_BUCKET), Sig[ReplicateRows](REPLICATE_ROWS),