diff --git a/build/buildall b/build/buildall index 4ec9e691c0d..087e6aabbc3 100755 --- a/build/buildall +++ b/build/buildall @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2021-2025, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2021-2026, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,8 +38,8 @@ function print_usage() { echo " -gb, --generate-bloop" echo " generate projects for Bloop clients: IDE (Scala Metals, IntelliJ) or Bloop CLI" echo " -p=DIST_PROFILE, --profile=DIST_PROFILE" - echo " use this profile for the dist module, default: noSnapshots, also supported: snapshots, minimumFeatureVersionMix," - echo " snapshotsWithDatabricks, noSnapshotsWithDatabricks, noSnapshotsScala213, snapshotsScala213." + echo " use this profile for the dist module, default: noSnapshots, also supported: snapshots," + echo " snapshotsWithDatabricks, noSnapshotsWithDatabricks" echo " NOTE: the Databricks-related spark3XYdb shims are not built locally, the jars are fetched prebuilt from a" echo " . remote Maven repo. You can also supply a comma-separated list of build versions. E.g., --profile=330,331 will" echo " build only the distribution jar only for 3.3.0 and 3.3.1" @@ -54,6 +54,8 @@ function print_usage() { echo " use this option to build project with maven. E.g., --option='-Dcudf.version=cuda12'" echo " --rebuild-dist-only" echo " repackage the dist module artifact using installed dependencies" + echo " --scala213" + echo " build 2.13 shims" } function bloopInstall() { @@ -152,7 +154,7 @@ case "$1" in ;; -o=*|--option=*) - MVN_OPT="${1#*=}" + export MVN_OPT="${1#*=}" ;; *) @@ -172,41 +174,35 @@ if [[ "$DIST_PROFILE" == *Scala213 ]]; then SCALA213=1 fi +MVN=${MVN:-"mvn"} # include options to mvn command -export MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 ${MVN_OPT}" +export MVN="$MVN -Dmaven.wagon.http.retryHandler.count=3 ${MVN_OPT}" if [[ "$SCALA213" == "1" ]]; then - MVN="$MVN -f scala2.13/" - DIST_PROFILE=${DIST_PROFILE:-"noSnapshotsScala213"} + POM_FILE="scala2.13/pom.xml" + export MVN="$MVN -f scala2.13/" $(dirname $0)/make-scala-version-build-files.sh 2.13 -else - DIST_PROFILE=${DIST_PROFILE:-"noSnapshots"} +else + POM_FILE="pom.xml" fi +DIST_PROFILE=${DIST_PROFILE:-"noSnapshots"} + + [[ "$MODULE" != "" ]] && MODULE_OPT="--projects $MODULE --also-make" || MODULE_OPT="" echo "Collecting Spark versions..." case $DIST_PROFILE in - snapshotsScala213) - SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "snap_and_no_snap" "scala2.13/pom.xml")) - ;; - - noSnapshotsScala213) - SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "no_snapshots" "scala2.13/pom.xml")) - ;; - snapshots?(WithDatabricks)) - SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "snap_and_no_snap" "pom.xml")) + SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "snap_and_no_snap" $POM_FILE)) ;; noSnapshots?(WithDatabricks)) - SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "no_snapshots" "pom.xml")) + SPARK_SHIM_VERSIONS=($(versionsFromReleaseProfiles "no_snapshots" $POM_FILE)) ;; - minimumFeatureVersionMix) - SPARK_SHIM_VERSIONS=($(versionsFromDistProfile "minimumFeatureVersionMix")) - ;; + [34]*) <<< $DIST_PROFILE IFS="," read -ra SPARK_SHIM_VERSIONS diff --git a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala index fe9c6f468ec..ad42aa6bfdd 100644 --- a/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala +++ b/datagen/src/main/spark400/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ /*** spark-rapids-shim-json-lines {"spark": "400"} {"spark": "401"} +{"spark": "411"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.tests.datagen diff --git a/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala b/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala index e86f33299c2..474f79ff6e3 100644 --- a/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala +++ b/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.delta.{DeltaIOProvider, GpuDeltaDataSource, RapidsDeltaUtils} import com.nvidia.spark.rapids.shims._ +import com.nvidia.spark.rapids.shims.InvalidateCacheShims import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession @@ -132,7 +133,7 @@ abstract class DeltaProviderBase extends DeltaIOProvider { cpuExec.tableSpec, cpuExec.writeOptions, cpuExec.orCreate, - cpuExec.invalidateCache) + InvalidateCacheShims.getInvalidateCache(cpuExec.invalidateCache)) } diff --git a/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala b/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala index c512721c5f1..824788c03e2 100644 --- a/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala +++ b/delta-lake/common/src/main/delta-33x-40x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ package com.nvidia.spark.rapids.delta.common import ai.rapids.cudf._ -import ai.rapids.cudf.HostColumnVector._ import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.RapidsPluginImplicits._ diff --git a/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/GpuOptimizeWriteExchangeExec.scala b/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/GpuOptimizeWriteExchangeExec.scala index 1f332fc66d9..5640376de72 100644 --- a/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/GpuOptimizeWriteExchangeExec.scala +++ b/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/GpuOptimizeWriteExchangeExec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * This file was derived from OptimizeWriteExchange.scala * in the Delta Lake project at https://github.com/delta-io/delta @@ -26,8 +26,7 @@ import scala.concurrent.Future import scala.concurrent.duration.Duration import com.nvidia.spark.rapids.{GpuColumnarBatchSerializer, GpuExec, GpuMetric, GpuPartitioning, GpuRoundRobinPartitioning, RapidsConf} -import com.nvidia.spark.rapids.GpuMetric.{OP_TIME_NEW_SHUFFLE_READ, OP_TIME_NEW_SHUFFLE_WRITE} -import com.nvidia.spark.rapids.GpuMetric.{DESCRIPTION_OP_TIME_NEW_SHUFFLE_READ, DESCRIPTION_OP_TIME_NEW_SHUFFLE_WRITE, MODERATE_LEVEL} +import com.nvidia.spark.rapids.GpuMetric._ import com.nvidia.spark.rapids.delta.RapidsDeltaSQLConf import com.nvidia.spark.rapids.shims.GpuHashPartitioning @@ -60,7 +59,6 @@ case class GpuOptimizeWriteExchangeExec( partitioning: GpuPartitioning, override val child: SparkPlan, @transient deltaLog: DeltaLog) extends Exchange with GpuExec with DeltaLogging { - import GpuMetric._ // Use 150% of target file size hint config considering parquet compression. // Still the result file can be smaller/larger than the config due to data skew or diff --git a/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/hooks/GpuAutoCompact.scala b/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/hooks/GpuAutoCompact.scala index 4ac4cac83a9..eed441e4006 100644 --- a/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/hooks/GpuAutoCompact.scala +++ b/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/hooks/GpuAutoCompact.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala * in the Delta Lake project at https://github.com/delta-io/delta. @@ -23,11 +23,8 @@ package org.apache.spark.sql.delta.hooks import org.apache.spark.internal.MDC import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions._ -import org.apache.spark.sql.delta.commands.DeltaOptimizeContext -import org.apache.spark.sql.delta.commands.optimize._ import org.apache.spark.sql.delta.logging.DeltaLogKeys import org.apache.spark.sql.delta.rapids.GpuOptimisticTransactionBase import org.apache.spark.sql.delta.stats.AutoCompactPartitionStats diff --git a/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/rapids/GpuWriteIntoDelta.scala b/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/rapids/GpuWriteIntoDelta.scala index 9f7bf72ec37..d3ee99d43b7 100644 --- a/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/rapids/GpuWriteIntoDelta.scala +++ b/delta-lake/common/src/main/delta-40x/scala/org/apache/spark/sql/delta/rapids/GpuWriteIntoDelta.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * Copyright (c) 2022-2026, NVIDIA CORPORATION. * * This file was derived from WriteIntoDelta.scala * in the Delta Lake project at https://github.com/delta-io/delta. @@ -45,8 +45,6 @@ import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.functions.{array, col, explode, lit, struct} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.nvidia.DFUDFShims -import org.apache.spark.sql.rapids.shims.TrampolineConnectShims.SparkSession import org.apache.spark.sql.types.StructType /** GPU version of Delta Lake's WriteIntoDelta. */ diff --git a/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuMergeIntoCommand.scala b/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuMergeIntoCommand.scala index e491febc243..717bd641b47 100644 --- a/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuMergeIntoCommand.scala +++ b/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuMergeIntoCommand.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * This file was derived from MergeIntoCommand.scala * in the Delta Lake project at https://github.com/delta-io/delta. @@ -30,11 +30,11 @@ import com.nvidia.spark.rapids.RapidsConf import com.nvidia.spark.rapids.delta._ import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, Row, SparkSession => SqlSparkSession} +import org.apache.spark.sql.{Row, SparkSession => SqlSparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, Literal, Or} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.classic.{ColumnNodeToExpressionConverter, ExpressionUtils, SparkSession => ClassicSparkSession} +import org.apache.spark.sql.classic.{SparkSession => ClassicSparkSession} import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.{AddFile, FileAction} import org.apache.spark.sql.delta.commands.MergeIntoCommandBase @@ -384,7 +384,7 @@ case class GpuMergeIntoCommand( } } commitAndRecordStats( - org.apache.spark.sql.classic.SparkSession.active, + ClassicSparkSession.active, gpuDeltaTxn, mergeActions, startTime, @@ -583,7 +583,6 @@ case class GpuMergeIntoCommand( val matchedRowCounts = collectTouchedFiles.groupBy(ROW_ID_COL).agg(sum("one").as("count")) // Get multiple matches and simultaneously collect (using touchedFilesAccum) the file names - import org.apache.spark.sql.delta.implicits._ val mmRow = matchedRowCounts .filter(col("count") > lit(1)) .select( diff --git a/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuOptimisticTransaction.scala b/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuOptimisticTransaction.scala index b0acf681806..c3ad36504b1 100644 --- a/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuOptimisticTransaction.scala +++ b/delta-lake/delta-40x/src/main/scala/org/apache/spark/sql/delta/rapids/delta40x/GpuOptimisticTransaction.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala * in the Delta Lake project at https://github.com/delta-io/delta. @@ -32,7 +32,7 @@ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.{SparkSession => SqlSparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.CatalogTable -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, RuntimeReplaceable} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.actions.{AddFile, FileAction} @@ -47,7 +47,6 @@ import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.functions.to_json import org.apache.spark.sql.rapids.{BasicColumnarWriteJobStatsTracker, ColumnarWriteJobStatsTracker, GpuWriteJobStatsTracker} import org.apache.spark.sql.rapids.delta.GpuIdentityColumn -import org.apache.spark.sql.rapids.shims.TrampolineConnectShims import org.apache.spark.sql.rapids.shims.TrampolineConnectShims.SparkSession import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch diff --git a/dist/README.md b/dist/README.md index af53b7a39e0..aa23b6a6332 100644 --- a/dist/README.md +++ b/dist/README.md @@ -30,7 +30,7 @@ for each version of Spark supported in the jar, i.e., spark330/, spark341/, etc. If you have to change the contents of the uber jar the following files control what goes into the base jar as classes that are not shaded. -1. `unshimmed-common-from-spark320.txt` - This has classes and files that should go into the base jar with their normal +1. `unshimmed-common-from-single-shim.txt` - This has classes and files that should go into the base jar with their normal package name (not shaded). This includes user visible classes (i.e., com/nvidia/spark/SQLPlugin), python files, and other files that aren't version specific. Uses Spark 3.2.0 built jar for these base classes as explained above. 2. `unshimmed-from-each-spark3xx.txt` - This is applied to all the individual Spark specific version jars to pull diff --git a/dist/build/package-parallel-worlds.py b/dist/build/package-parallel-worlds.py index ef64a4cd6bd..652b34410cf 100644 --- a/dist/build/package-parallel-worlds.py +++ b/dist/build/package-parallel-worlds.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ def shell_exec(shell_cmd): artifacts = attributes.get('artifact_csv').split(',') buildver_list = re.sub(r'\s+', '', project.getProperty('included_buildvers'), flags=re.UNICODE).split(',') +buildver_list = sorted(buildver_list, reverse=True) source_basedir = project.getProperty('spark.rapids.source.basedir') project_basedir = project.getProperty('spark.rapids.project.basedir') project_version = project.getProperty('project.version') @@ -73,8 +74,8 @@ def shell_exec(shell_cmd): shell_exec(mvn_cmd) dist_dir = os.sep.join([source_basedir, 'dist']) - with open(os.sep.join([dist_dir, 'unshimmed-common-from-spark320.txt']), 'r') as f: - from_spark320 = f.read().splitlines() + with open(os.sep.join([dist_dir, 'unshimmed-common-from-single-shim.txt']), 'r') as f: + from_single_shim = f.read().splitlines() with open(os.sep.join([dist_dir, 'unshimmed-from-each-spark3xx.txt']), 'r') as f: from_each = f.read().splitlines() with zipfile.ZipFile(os.sep.join([deps_dir, art_jar]), 'r') as zip_handle: @@ -82,13 +83,13 @@ def shell_exec(shell_cmd): zip_handle.extractall(path=top_dist_jar_dir) else: zip_handle.extractall(path=os.sep.join([top_dist_jar_dir, classifier])) - # IMPORTANT unconditional extract from first to the top + # IMPORTANT unconditional extract from the highest Spark version to the top if bv == buildver_list[0] and art == 'sql-plugin-api': zip_handle.extractall(path=top_dist_jar_dir) # TODO deprecate namelist = zip_handle.namelist() matching_members = [] - glob_list = from_spark320 + from_each if bv == buildver_list[0] else from_each + glob_list = from_single_shim + from_each if bv == buildver_list[0] else from_each for pat in glob_list: new_matches = fnmatch.filter(namelist, pat) matching_members += new_matches diff --git a/dist/maven-antrun/build-parallel-worlds.xml b/dist/maven-antrun/build-parallel-worlds.xml index bc4d7c9991c..9f422bc95fa 100644 --- a/dist/maven-antrun/build-parallel-worlds.xml +++ b/dist/maven-antrun/build-parallel-worlds.xml @@ -1,6 +1,6 @@ + includesfile="${spark.rapids.source.basedir}/${rapids.module}/unshimmed-common-from-single-shim.txt"/> diff --git a/dist/scripts/binary-dedupe.sh b/dist/scripts/binary-dedupe.sh index b761ea57826..6813818bcd1 100755 --- a/dist/scripts/binary-dedupe.sh +++ b/dist/scripts/binary-dedupe.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2025, NVIDIA CORPORATION. +# Copyright (c) 2021-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -85,9 +85,6 @@ function retain_single_copy() { package_class_parts=("${path_parts[@]:2}") - package_len=$((${#package_class_parts[@]} - 1)) - package_parts=("${package_class_parts[@]::$package_len}") - package_class_with_spaces="${package_class_parts[*]}" # com/nvidia/spark/udf/Repr\$UnknownCapturedArg\$.class package_class="${package_class_with_spaces// //}" @@ -164,12 +161,16 @@ function verify_same_sha_for_unshimmed() { # sha1 look up if there is an entry with the unshimmed class as a suffix class_file_quoted=$(printf '%q' "$class_file") - # TODO currently RapidsShuffleManager is "removed" from /spark* by construction in # dist pom.xml via ant. We could delegate this logic to this script # and make both simmpler - if [[ ! "$class_file_quoted" =~ com/nvidia/spark/rapids/spark[34].*/.*ShuffleManager.class ]]; then - + # + # TODO ParquetCachedBatchSerializer is not bitwise-identical after 411, + # but it is compatible with previous versions because it merely adds a new method. + # we might need to replace this strict check with MiMa + # https://github.com/apache/spark/blob/7011706a0a8dbec6adb5b5b121921b29b314335f/sql/core/src/main/scala/org/apache/spark/sql/columnar/CachedBatchSerializer.scala#L75-L95 + if [[ ! "$class_file_quoted" =~ com/nvidia/spark/rapids/spark[34].*/.*ShuffleManager.class && \ + "$class_file_quoted" != "com/nvidia/spark/ParquetCachedBatchSerializer.class" ]]; then if ! grep -q "/spark.\+/$class_file_quoted" "$SPARK_SHARED_TXT"; then echo >&2 "$class_file is not bitwise-identical across shims" exit 255 diff --git a/dist/unshimmed-common-from-spark320.txt b/dist/unshimmed-common-from-single-shim.txt similarity index 83% rename from dist/unshimmed-common-from-spark320.txt rename to dist/unshimmed-common-from-single-shim.txt index 3871188e1fc..81a5c61003c 100644 --- a/dist/unshimmed-common-from-spark320.txt +++ b/dist/unshimmed-common-from-single-shim.txt @@ -1,8 +1,6 @@ META-INF/DEPENDENCIES META-INF/LICENSE META-INF/NOTICE -com/nvidia/spark/GpuCachedBatchSerializer* -com/nvidia/spark/ParquetCachedBatchSerializer* com/nvidia/spark/rapids/ExplainPlan.class com/nvidia/spark/rapids/ExplainPlan$.class com/nvidia/spark/rapids/ExplainPlanBase.class diff --git a/integration_tests/requirements.txt b/integration_tests/requirements.txt index 616e9318949..ce0fc4318ec 100644 --- a/integration_tests/requirements.txt +++ b/integration_tests/requirements.txt @@ -17,6 +17,7 @@ pandas pyarrow == 17.0.0 ; python_version == '3.8' pyarrow == 19.0.1 ; python_version >= '3.9' pytest-xdist >= 2.0.0 +pytz findspark fsspec == 2025.3.0 fastparquet == 2024.5.0 ; python_version >= '3.9' diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index eb63174d6a9..a619e6c66ae 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -79,7 +79,7 @@ else # PySpark uses ".dev0" for "-SNAPSHOT" and either ".dev" for "preview" or ".devN" for "previewN" # https://github.com/apache/spark/blob/66f25e314032d562567620806057fcecc8b71f08/dev/create-release/release-build.sh#L267 VERSION_STRING=$(PYTHONPATH=${SPARK_HOME}/python:${PY4J_FILE} python -c \ - "import pyspark, re; print(re.sub('\.dev[012]?$', '', pyspark.__version__))" + "import pyspark, re; print(re.sub(r'\.dev[012]?$', '', pyspark.__version__))" ) SCALA_VERSION=`$SPARK_HOME/bin/pyspark --version 2>&1| grep Scala | awk '{split($4,v,"."); printf "%s.%s", v[1], v[2]}'` diff --git a/integration_tests/src/main/python/asserts.py b/integration_tests/src/main/python/asserts.py index b211b8b2bcc..cc9013cd845 100644 --- a/integration_tests/src/main/python/asserts.py +++ b/integration_tests/src/main/python/asserts.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# Copyright (c) 2020-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -95,6 +95,9 @@ def _assert_equal(cpu, gpu, float_check, path): assert cpu == gpu, f"GPU ({gpu}) and CPU ({cpu}) decimal values are different at {path}" elif isinstance(cpu, bytearray): assert cpu == gpu, f"GPU ({gpu}) and CPU ({cpu}) bytearray values are different at {path}" + elif isinstance(cpu, bytes): + # Spark 4.1.0+ returns bytes instead of bytearray for binary data + assert cpu == gpu, f"GPU ({gpu}) and CPU ({cpu}) bytes values are different at {path}" elif isinstance(cpu, timedelta): # Used by interval type DayTimeInterval for Pyspark 3.3.0+ assert cpu == gpu, f"GPU ({gpu}) and CPU ({cpu}) timedelta values are different at {path}" diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py index 19961e30cb7..51b3f4f2857 100644 --- a/integration_tests/src/main/python/udf_test.py +++ b/integration_tests/src/main/python/udf_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. +# Copyright (c) 2020-2026, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -49,6 +49,8 @@ arrow_udf_conf = { 'spark.sql.execution.arrow.pyspark.enabled': 'true', 'spark.rapids.sql.exec.WindowInPandasExec': 'true', + # ArrowWindowPythonExec is the new name for WindowInPandasExec in Spark 4.1+ + 'spark.rapids.sql.exec.ArrowWindowPythonExec': 'true', 'spark.rapids.sql.exec.FlatMapCoGroupsInPandasExec': 'true' } @@ -148,7 +150,7 @@ def pandas_sum(to_process: pd.Series) -> int: lambda spark : binary_op_df(spark, data_gen)\ .groupBy('a')\ .agg(pandas_sum(f.col('b'))), - conf=arrow_udf_conf) + conf=arrow_udf_conf_unsafe) @ignore_order(local=True) diff --git a/integration_tests/src/test/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineUtilShim.scala b/integration_tests/src/test/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineUtilShim.scala index cf3556baec9..61924942c45 100644 --- a/integration_tests/src/test/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineUtilShim.scala +++ b/integration_tests/src/test/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineUtilShim.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ /*** spark-rapids-shim-json-lines {"spark": "400"} {"spark": "401"} +{"spark": "411"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.shims diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml index 3d743f857bb..8b51165431d 100644 --- a/jdk-profiles/pom.xml +++ b/jdk-profiles/pom.xml @@ -1,6 +1,6 @@ [9,) + + buildver + !411 + @@ -63,7 +65,6 @@ net.alchim31.maven scala-maven-plugin - ${scala.plugin.version} ${java.major.version} @@ -72,5 +73,31 @@ + + release411 + + + buildver + 411 + + + + + + + net.alchim31.maven + scala-maven-plugin + + + -Xlint:all,-serial,-path,-try,-processing|-target|${java.major.version} + + + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index 5747fac6193..038331d7dc5 100644 --- a/pom.xml +++ b/pom.xml @@ -769,6 +769,26 @@ delta-lake/delta-40x + + release411 + + + buildver + 411 + + + + 411 + ${spark411.version} + ${spark411.version} + 1.13.1 + rapids-4-spark-delta-stub + 2.0.7 + + + delta-lake/delta-stub + + --> source-javadoc @@ -794,14 +814,14 @@ scala-2.12 2.12 - 2.12.15 + 2.12.21 scala-2.13 2.13 - 2.13.14 + 2.13.18 @@ -894,7 +914,7 @@ 26.02.0-SNAPSHOT 2.12 incremental - 2.12.15 + 2.12.21 4.9.2 @@ -1469,10 +1490,8 @@ This will force full Scala code rebuild in downstream modules. -Wconf:cat=lint-adapted-args:e diff --git a/scala2.13/jdk-profiles/pom.xml b/scala2.13/jdk-profiles/pom.xml index d20e01a02ef..81ab0c08e5c 100644 --- a/scala2.13/jdk-profiles/pom.xml +++ b/scala2.13/jdk-profiles/pom.xml @@ -1,6 +1,6 @@ [9,) + + buildver + !411 + @@ -63,7 +65,6 @@ net.alchim31.maven scala-maven-plugin - ${scala.plugin.version} ${java.major.version} @@ -72,5 +73,31 @@ + + release411 + + + buildver + 411 + + + + + + + net.alchim31.maven + scala-maven-plugin + + + -Xlint:all,-serial,-path,-try,-processing|-target|${java.major.version} + + + + + + \ No newline at end of file diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 9ae5365d117..0c208bd2230 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -769,6 +769,26 @@ delta-lake/delta-40x + + release411 + + + buildver + 411 + + + + 411 + ${spark411.version} + ${spark411.version} + 1.13.1 + rapids-4-spark-delta-stub + 2.0.7 + + + delta-lake/delta-stub + + source-javadoc @@ -794,14 +814,14 @@ scala-2.12 2.12 - 2.12.15 + 2.12.21 scala-2.13 2.13 - 2.13.14 + 2.13.18 @@ -894,7 +914,7 @@ 26.02.0-SNAPSHOT 2.13 incremental - 2.13.14 + 2.13.18 4.9.2 @@ -1469,10 +1490,8 @@ This will force full Scala code rebuild in downstream modules. -Wconf:cat=lint-adapted-args:e -Xsource:2.13 - -Ywarn-unused:locals,patvars,privates -Wconf:cat=deprecation:e,any:e -Wconf:cat=scaladoc:wv - -Wconf:cat=lint-multiarg-infix:wv -Wconf:cat=other-nullary-override:e -Wconf:msg=^(?=.*?method|value|type|object|trait|inheritance)(?=.*?deprecated)(?=.*?since 2.13).+$:s -Wconf:msg=^(?=.*?Widening conversion from)(?=.*?is deprecated because it loses precision).+$:s @@ -1480,6 +1499,14 @@ This will force full Scala code rebuild in downstream modules. -Wconf:cat=unchecked&msg=outer reference:s -Wconf:cat=unchecked&msg=eliminated by erasure:s -Wconf:msg=^(?=.*?a value of type)(?=.*?cannot also be).+$:s + -Wconf:cat=unused:e + -Wconf:cat=unused-imports:e + -Wconf:cat=unused-locals:e + -Wconf:cat=unused-nowarn:e + -Wconf:cat=unused-params:e + -Wconf:cat=unused-pat-vars:e + -Wconf:cat=unused-privates:e + -Wunused:imports,locals,patvars,privates diff --git a/scala2.13/sql-plugin-api/pom.xml b/scala2.13/sql-plugin-api/pom.xml index c3c28b8c3c9..f4a79c3cde5 100644 --- a/scala2.13/sql-plugin-api/pom.xml +++ b/scala2.13/sql-plugin-api/pom.xml @@ -1,6 +1,6 @@