Skip to content

Commit 3360ca5

Browse files
Merge branch 'apache:main' into main
2 parents 498c989 + 941c300 commit 3360ca5

File tree

16 files changed

+1055
-203
lines changed

16 files changed

+1055
-203
lines changed

.github/workflows/pr_build_linux.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ jobs:
129129
org.apache.comet.exec.CometAggregateSuite
130130
org.apache.comet.exec.CometExec3_4PlusSuite
131131
org.apache.comet.exec.CometExecSuite
132+
org.apache.comet.exec.CometWindowExecSuite
132133
org.apache.comet.exec.CometJoinSuite
133134
org.apache.comet.CometArrayExpressionSuite
134135
org.apache.comet.CometCastSuite

.github/workflows/pr_build_macos.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ jobs:
9494
org.apache.comet.exec.CometAggregateSuite
9595
org.apache.comet.exec.CometExec3_4PlusSuite
9696
org.apache.comet.exec.CometExecSuite
97+
org.apache.comet.exec.CometWindowExecSuite
9798
org.apache.comet.exec.CometJoinSuite
9899
org.apache.comet.CometArrayExpressionSuite
99100
org.apache.comet.CometCastSuite

docs/source/contributor-guide/spark-sql-tests.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ git apply ../datafusion-comet/dev/diffs/3.4.3.diff
5656

5757
### Use the following commands to run the Spark SQL test suite locally.
5858

59+
Optionally, enable Comet fallback logging so that all fallback reasons are logged at the `WARN` level.
60+
61+
```shell
62+
export ENABLE_COMET_LOG_FALLBACK_REASONS=true
63+
```
64+
5965
```shell
6066
ENABLE_COMET=true ENABLE_COMET_ONHEAP=true build/sbt catalyst/test
6167
ENABLE_COMET=true ENABLE_COMET_ONHEAP=true build/sbt "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest"
@@ -68,7 +74,7 @@ ENABLE_COMET=true ENABLE_COMET_ONHEAP=true build/sbt "hive/testOnly * -- -n org.
6874
### Steps to run individual test suites through SBT
6975
1. Open SBT with Comet enabled
7076
```shell
71-
ENABLE_COMET=true sbt -J-Xmx4096m -Dspark.test.includeSlowTests=true
77+
ENABLE_COMET=true ENABLE_COMET_ONHEAP=true sbt -J-Xmx4096m -Dspark.test.includeSlowTests=true
7278
```
7379
2. Run individual tests (Below code runs test named `SPARK-35568` in the `spark-sql` module)
7480
```shell

docs/source/user-guide/latest/compatibility.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ and sorting on floating-point data can be enabled by setting `spark.comet.expres
5656
## Incompatible Expressions
5757

5858
Expressions that are not 100% Spark-compatible will fall back to Spark by default and can be enabled by setting
59-
`spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See
60-
the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.
59+
`spark.comet.expression.EXPRNAME.allowIncompatible=true`, where `EXPRNAME` is the Spark expression class name. See
60+
the [Comet Supported Expressions Guide](expressions.md) for more information on this configuration setting.
6161

6262
It is also possible to specify `spark.comet.expression.allowIncompatible=true` to enable all
6363
incompatible expressions.

fuzz-testing/src/main/scala/org/apache/comet/fuzz/ComparisonTool.scala

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class ComparisonToolConf(arguments: Seq[String]) extends ScallopConf(arguments)
3131
opt[String](required = true, descr = "Folder with Spark produced results in Parquet format")
3232
val inputCometFolder: ScallopOption[String] =
3333
opt[String](required = true, descr = "Folder with Comet produced results in Parquet format")
34+
val tolerance: ScallopOption[Double] =
35+
opt[Double](default = Some(0.000002), descr = "Tolerance for floating point comparisons")
3436
}
3537
addSubcommand(compareParquet)
3638
verify()
@@ -49,7 +51,8 @@ object ComparisonTool {
4951
compareParquetFolders(
5052
spark,
5153
conf.compareParquet.inputSparkFolder(),
52-
conf.compareParquet.inputCometFolder())
54+
conf.compareParquet.inputCometFolder(),
55+
conf.compareParquet.tolerance())
5356

5457
case _ =>
5558
// scalastyle:off println
@@ -62,7 +65,8 @@ object ComparisonTool {
6265
private def compareParquetFolders(
6366
spark: SparkSession,
6467
sparkFolderPath: String,
65-
cometFolderPath: String): Unit = {
68+
cometFolderPath: String,
69+
tolerance: Double): Unit = {
6670

6771
val output = QueryRunner.createOutputMdFile()
6872

@@ -115,7 +119,7 @@ object ComparisonTool {
115119
val cometRows = cometDf.orderBy(cometDf.columns.map(functions.col): _*).collect()
116120

117121
// Compare the results
118-
if (QueryComparison.assertSameRows(sparkRows, cometRows, output)) {
122+
if (QueryComparison.assertSameRows(sparkRows, cometRows, output, tolerance)) {
119123
output.write(s"Subfolder $subfolderName: ${sparkRows.length} rows matched\n\n")
120124
} else {
121125
// Output schema if dataframes are not equal

fuzz-testing/src/main/scala/org/apache/comet/fuzz/QueryRunner.scala

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ object QueryComparison {
148148
def assertSameRows(
149149
sparkRows: Array[Row],
150150
cometRows: Array[Row],
151-
output: BufferedWriter): Boolean = {
151+
output: BufferedWriter,
152+
tolerance: Double = 0.000001): Boolean = {
152153
if (sparkRows.length == cometRows.length) {
153154
var i = 0
154155
while (i < sparkRows.length) {
@@ -164,7 +165,7 @@ object QueryComparison {
164165

165166
assert(l.length == r.length)
166167
for (j <- 0 until l.length) {
167-
if (!same(l(j), r(j))) {
168+
if (!same(l(j), r(j), tolerance)) {
168169
output.write(s"First difference at row $i:\n")
169170
output.write("Spark: `" + formatRow(l) + "`\n")
170171
output.write("Comet: `" + formatRow(r) + "`\n")
@@ -186,7 +187,7 @@ object QueryComparison {
186187
true
187188
}
188189

189-
private def same(l: Any, r: Any): Boolean = {
190+
private def same(l: Any, r: Any, tolerance: Double): Boolean = {
190191
if (l == null || r == null) {
191192
return l == null && r == null
192193
}
@@ -195,20 +196,20 @@ object QueryComparison {
195196
case (a: Float, b: Float) if a.isNegInfinity => b.isNegInfinity
196197
case (a: Float, b: Float) if a.isInfinity => b.isInfinity
197198
case (a: Float, b: Float) if a.isNaN => b.isNaN
198-
case (a: Float, b: Float) => (a - b).abs <= 0.000001f
199+
case (a: Float, b: Float) => (a - b).abs <= tolerance
199200
case (a: Double, b: Double) if a.isPosInfinity => b.isPosInfinity
200201
case (a: Double, b: Double) if a.isNegInfinity => b.isNegInfinity
201202
case (a: Double, b: Double) if a.isInfinity => b.isInfinity
202203
case (a: Double, b: Double) if a.isNaN => b.isNaN
203-
case (a: Double, b: Double) => (a - b).abs <= 0.000001
204+
case (a: Double, b: Double) => (a - b).abs <= tolerance
204205
case (a: Array[_], b: Array[_]) =>
205-
a.length == b.length && a.zip(b).forall(x => same(x._1, x._2))
206+
a.length == b.length && a.zip(b).forall(x => same(x._1, x._2, tolerance))
206207
case (a: mutable.WrappedArray[_], b: mutable.WrappedArray[_]) =>
207-
a.length == b.length && a.zip(b).forall(x => same(x._1, x._2))
208+
a.length == b.length && a.zip(b).forall(x => same(x._1, x._2, tolerance))
208209
case (a: Row, b: Row) =>
209210
val aa = a.toSeq
210211
val bb = b.toSeq
211-
aa.length == bb.length && aa.zip(bb).forall(x => same(x._1, x._2))
212+
aa.length == bb.length && aa.zip(bb).forall(x => same(x._1, x._2, tolerance))
212213
case (a, b) => a == b
213214
}
214215
}

spark/pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ under the License.
326326
</goals>
327327
<configuration>
328328
<mainClass>org.apache.comet.GenerateDocs</mainClass>
329+
<arguments>${project.parent.basedir}/docs/source/user-guide/latest/</arguments>
329330
<classpathScope>compile</classpathScope>
330331
</configuration>
331332
</execution>

spark/src/main/scala/org/apache/comet/GenerateDocs.scala

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,16 @@ import org.apache.comet.serde.{Compatible, Incompatible, QueryPlanSerde}
3636
*/
3737
object GenerateDocs {
3838

39-
private def userGuideLocation = "docs/source/user-guide/latest/"
40-
41-
val publicConfigs: Set[ConfigEntry[_]] = CometConf.allConfs.filter(_.isPublic).toSet
39+
private val publicConfigs: Set[ConfigEntry[_]] = CometConf.allConfs.filter(_.isPublic).toSet
4240

4341
def main(args: Array[String]): Unit = {
44-
generateConfigReference()
45-
generateCompatibilityGuide()
42+
val userGuideLocation = args(0)
43+
generateConfigReference(s"$userGuideLocation/configs.md")
44+
generateCompatibilityGuide(s"$userGuideLocation/compatibility.md")
4645
}
4746

48-
private def generateConfigReference(): Unit = {
47+
private def generateConfigReference(filename: String): Unit = {
4948
val pattern = "<!--BEGIN:CONFIG_TABLE\\[(.*)]-->".r
50-
val filename = s"$userGuideLocation/configs.md"
5149
val lines = readFile(filename)
5250
val w = new BufferedOutputStream(new FileOutputStream(filename))
5351
for (line <- lines) {
@@ -95,8 +93,7 @@ object GenerateDocs {
9593
w.close()
9694
}
9795

98-
private def generateCompatibilityGuide(): Unit = {
99-
val filename = s"$userGuideLocation/compatibility.md"
96+
private def generateCompatibilityGuide(filename: String): Unit = {
10097
val lines = readFile(filename)
10198
val w = new BufferedOutputStream(new FileOutputStream(filename))
10299
for (line <- lines) {

spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -887,7 +887,7 @@ case class CometExecRule(session: SparkSession) extends Rule[SparkPlan] {
887887
var supported = true
888888
for (o <- orderings) {
889889
if (QueryPlanSerde.exprToProto(o, inputs).isEmpty) {
890-
withInfo(s, s"unsupported range partitioning sort order: $o")
890+
withInfo(s, s"unsupported range partitioning sort order: $o", o)
891891
supported = false
892892
// We don't short-circuit in case there is more than one unsupported expression
893893
// to provide info for.

spark/src/main/scala/org/apache/comet/serde/literals.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import java.lang
2323

2424
import org.apache.spark.internal.Logging
2525
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
26-
import org.apache.spark.sql.catalyst.util.GenericArrayData
26+
import org.apache.spark.sql.catalyst.util.ArrayData
2727
import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType, DateType, Decimal, DecimalType, DoubleType, FloatType, IntegerType, LongType, NullType, ShortType, StringType, TimestampNTZType, TimestampType}
2828
import org.apache.spark.unsafe.types.UTF8String
2929

@@ -92,7 +92,7 @@ object CometLiteral extends CometExpressionSerde[Literal] with Logging {
9292

9393
case arr: ArrayType =>
9494
val listLiteralBuilder: ListLiteral.Builder =
95-
makeListLiteral(value.asInstanceOf[GenericArrayData].array, arr)
95+
makeListLiteral(value.asInstanceOf[ArrayData].array, arr)
9696
exprBuilder.setListVal(listLiteralBuilder.build())
9797
exprBuilder.setDatatype(serializeDataType(dataType).get)
9898
case dt =>
@@ -198,7 +198,7 @@ object CometLiteral extends CometExpressionSerde[Literal] with Logging {
198198
})
199199
case a: ArrayType =>
200200
array.foreach(v => {
201-
val casted = v.asInstanceOf[GenericArrayData]
201+
val casted = v.asInstanceOf[ArrayData]
202202
listLiteralBuilder.addListValues(if (casted != null) {
203203
makeListLiteral(casted.array, a)
204204
} else ListLiteral.newBuilder())

0 commit comments

Comments
 (0)