From 127bad320fa46d3a58ee679c70ac507af2a6d877 Mon Sep 17 00:00:00 2001
From: noroshi <253434427+n0r0shi@users.noreply.github.com>
Date: Sat, 28 Feb 2026 10:09:25 +0000
Subject: [PATCH] [GLUTEN-7628][VL] Support monotonically_increasing_id
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `Sig[MonotonicallyIncreasingID]` to `ExpressionMappings.SCALAR_SIGS`
so the function is offloaded to Velox instead of falling back to vanilla
Spark.

Also sets Velox's `expression.dedup_non_deterministic` to `false`. By
default Velox deduplicates structurally identical non-deterministic
expression trees, merging them into a single instance with shared state.
This is incorrect under Spark semantics, where each non-deterministic
call has independent state — e.g. `SELECT monotonically_increasing_id(),
monotonically_increasing_id()` must return the rows [0,0] and [1,1] (two
independent counters), not [0,2] and [1,3] (one shared counter).

For seeded functions like `rand(42)`, disabling dedup is safe: each
independent instance produces the same sequence from the same seed, so
the output matches Spark's whether or not dedup is enabled.

Un-ignores and fixes the corresponding test in
`ScalarFunctionsValidateSuite`.

Closes #7628
---
 .../gluten/functions/ScalarFunctionsValidateSuite.scala  | 9 +++++++--
 cpp/velox/compute/WholeStageResultIterator.cc            | 8 ++++++++
 .../apache/gluten/expression/ExpressionMappings.scala    | 1 +
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala
index 302b4aa603d3..4c877bc53d60 100644
--- a/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/gluten/functions/ScalarFunctionsValidateSuite.scala
@@ -496,12 +496,17 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite {
     }
   }
 
-  // FIXME: Ignored: https://github.com/apache/incubator-gluten/issues/7600.
-  ignore("monotonically_increasintestg_id") {
+  test("monotonically_increasing_id") {
     runQueryAndCompare("""SELECT monotonically_increasing_id(), l_orderkey
                          | from lineitem limit 100""".stripMargin) {
       checkGlutenPlan[ProjectExecTransformer]
     }
+    // Multiple calls must produce independent results (issue #7628).
+    runQueryAndCompare(
+      """SELECT monotonically_increasing_id(), monotonically_increasing_id()
+        | from lineitem limit 100""".stripMargin) {
+      checkGlutenPlan[ProjectExecTransformer]
+    }
   }
 
   test("sequence function optimized by Spark constant folding") {
diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc
index babcaf0e5f64..3a1e9888da64 100644
--- a/cpp/velox/compute/WholeStageResultIterator.cc
+++ b/cpp/velox/compute/WholeStageResultIterator.cc
@@ -666,6 +666,14 @@ std::unordered_map<std::string, std::string> WholeStageResultIterator::getQueryC
 
     configs[velox::core::QueryConfig::kSparkPartitionId] = std::to_string(taskInfo_.partitionId);
 
+    // Disable expression deduplication for non-deterministic functions to
+    // match Spark semantics. Spark creates separate instances for each
+    // non-deterministic expression, each with independent state. Without
+    // this, Velox merges structurally identical non-deterministic calls
+    // (e.g. two monotonically_increasing_id() in the same query) into one
+    // shared instance, causing incorrect results.
+    configs[velox::core::QueryConfig::kExprDedupNonDeterministic] = "false";
+
     // Enable Spark legacy date formatter if spark.sql.legacy.timeParserPolicy is set to 'LEGACY'
     // or 'legacy'
     if (veloxCfg_->get<std::string>(kSparkLegacyTimeParserPolicy, "") == "LEGACY") {
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala
index b13aced2a62c..b190419e4b27 100644
--- a/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala
+++ b/gluten-substrait/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala
@@ -290,6 +290,7 @@ object ExpressionMappings {
     Sig[MakeDecimal](MAKE_DECIMAL),
     Sig[PromotePrecision](PROMOTE_PRECISION),
     Sig[SparkPartitionID](SPARK_PARTITION_ID),
+    Sig[MonotonicallyIncreasingID](MONOTONICALLY_INCREASING_ID),
     Sig[AtLeastNNonNulls](AT_LEAST_N_NON_NULLS),
     Sig[WidthBucket](WIDTH_BUCKET),
     Sig[ReplicateRows](REPLICATE_ROWS),