From 2320f58f8efc653700197fa6df7a87b53273cad3 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Tue, 2 Dec 2025 12:07:15 -0800 Subject: [PATCH 1/9] updates for 25.12 nightly Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 4 +- docker/Dockerfile.pip | 5 +- docker/Dockerfile.python | 4 +- docs/source/conf.py | 2 +- jvm/README.md | 6 +- jvm/pom.xml | 2 +- notebooks/aws-emr/init-bootstrap-action.sh | 8 ++- notebooks/databricks/init-pip-cuda-12.0.sh | 8 ++- notebooks/dataproc/README.md | 2 +- notebooks/dataproc/spark_rapids_ml.sh | 10 ++- notebooks/logistic-regression.ipynb | 3 +- python/README.md | 6 +- .../benchmark/databricks/cpu_cluster_spec.sh | 2 +- .../benchmark/databricks/gpu_cluster_spec.sh | 4 +- .../databricks/gpu_etl_cluster_spec.sh | 4 +- .../databricks/init-pip-cuda-12.0.sh | 8 ++- python/benchmark/databricks/setup.sh | 2 +- python/benchmark/dataproc/init_benchmark.sh | 10 ++- python/pyproject.toml | 2 +- python/src/spark_rapids_ml/__init__.py | 2 +- python/src/spark_rapids_ml/classification.py | 36 +++------- python/src/spark_rapids_ml/clustering.py | 10 +-- python/src/spark_rapids_ml/connect_plugin.py | 1 - python/src/spark_rapids_ml/feature.py | 13 ++-- python/src/spark_rapids_ml/metrics/utils.py | 68 +++++++++++++++++++ python/src/spark_rapids_ml/regression.py | 9 ++- python/src/spark_rapids_ml/tree.py | 2 +- python/src/spark_rapids_ml/umap.py | 19 +++++- python/src/spark_rapids_ml/utils.py | 4 +- python/tests/test_linear_model.py | 2 +- python/tests/test_logistic_regression.py | 50 ++++++-------- python/tests/test_umap.py | 2 +- .../test_large_logistic_regression.py | 6 +- 33 files changed, 202 insertions(+), 114 deletions(-) create mode 100644 python/src/spark_rapids_ml/metrics/utils.py diff --git a/ci/Dockerfile b/ci/Dockerfile index 35f584cf7..acf57cb8b 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -47,6 +47,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 && conda config --set solver libmamba # install cuML -ARG CUML_VER=25.10 -RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=12.0 numpy~=1.0 \ +ARG RAPIDS_VERSION=25.12 +RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-cudart cuda-version=12.0 numpy~=1.0 \ && conda clean --all -f -y diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index 1dd97f9fb..2ea468cd5 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -18,7 +18,7 @@ ARG CUDA_VERSION=12.0.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 ARG PYSPARK_VERSION=3.3.1 -ARG RAPIDS_VERSION=25.10.0 +ARG RAPIDS_VERSION=25.12.0 ARG ARCH=amd64 #ARG ARCH=arm64 @@ -50,6 +50,9 @@ RUN pip install --no-cache-dir \ cudf-cu12~=${RAPIDS_VERSION} \ cuml-cu12~=${RAPIDS_VERSION} \ cuvs-cu12~=${RAPIDS_VERSION} \ + pylibraft-cu12~=${RAPIDS_VERSION} \ + raft-dask-cu12~=${RAPIDS_VERSION} \ + dask-cuda-cu12~=${RAPIDS_VERSION} \ numpy~=1.0 \ --extra-index-url=https://pypi.nvidia.com diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index 4d8f335ae..9d3664510 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -17,7 +17,7 @@ ARG CUDA_VERSION=12.0.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 -ARG CUML_VERSION=25.10 +ARG RAPIDS_VERSION=25.12 # ubuntu22 RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ @@ -47,7 +47,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu # install cuML -RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.0 cuml=$CUML_VERSION numpy~=1.0 \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.0 cuml=$RAPIDS_VERSION cudf=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-cudart numpy~=1.0 \ && conda clean --all -f -y # install python dependencies diff --git a/docs/source/conf.py b/docs/source/conf.py index 7399523c0..c7db1bf1b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -23,7 +23,7 @@ project = 'spark-rapids-ml' copyright = '2025, NVIDIA' author = 'NVIDIA' -release = '25.10.0' +release = '25.12.0' # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/jvm/README.md b/jvm/README.md index 282b029c6..1eb7d92f4 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -50,10 +50,10 @@ including setting up the server and running client-side tests. To start the Spark Connect server with Spark Rapids ML support, follow these steps: ```shell -conda activate rapids-25.10 # from spark-rapids-ml installation +conda activate rapids-25.12 # from spark-rapids-ml installation export SPARK_HOME= export PYSPARK_PYTHON=$(which python) -export PLUGIN_JAR=$(pip show spark-rapids-ml | grep Location: | cut -d ' ' -f 2 )/spark_rapids_ml/jars/com.nvidia.rapids.ml-25.10.0.jar +export PLUGIN_JAR=$(pip show spark-rapids-ml | grep Location: | cut -d ' ' -f 2 )/spark_rapids_ml/jars/com.nvidia.rapids.ml-25.12.0.jar $SPARK_HOME/sbin/start-connect-server.sh --master local[*] \ --jars $PLUGIN_JAR \ --conf spark.driver.memory=20G @@ -107,7 +107,7 @@ mvn clean package -DskipTests if you would like to compile the plugin and run the unit tests, install `spark-rapids-ml` python package and its dependencies per the above instructions and run the following command: ``` shell -conda activate rapids-25.10 +conda activate rapids-25.12 export PYSPARK_PYTHON=$(which python) mvn clean package ``` diff --git a/jvm/pom.xml b/jvm/pom.xml index 39f449c89..09aae8bf1 100644 --- a/jvm/pom.xml +++ b/jvm/pom.xml @@ -21,7 +21,7 @@ com.nvidia.rapids ml - 25.10.0 + 25.12.0 jar diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh index a2d6a4846..a812e0434 100755 --- a/notebooks/aws-emr/init-bootstrap-action.sh +++ b/notebooks/aws-emr/init-bootstrap-action.sh @@ -27,7 +27,7 @@ sudo bash -c "wget https://www.python.org/ftp/python/3.10.9/Python-3.10.9.tgz && tar xzf Python-3.10.9.tgz && cd Python-3.10.9 && \ ./configure --enable-optimizations && make altinstall" -RAPIDS_VERSION=25.10.0 +RAPIDS_VERSION=25.12.0 sudo /usr/local/bin/pip3.10 install --upgrade pip @@ -35,9 +35,13 @@ sudo /usr/local/bin/pip3.10 install --upgrade pip sudo /usr/local/bin/pip3.10 install scikit-learn # install cudf and cuml -sudo /usr/local/bin/pip3.10 install --no-cache-dir cudf-cu12~=${RAPIDS_VERSION} \ +sudo /usr/local/bin/pip3.10 install --no-cache-dir \ + cudf-cu12~=${RAPIDS_VERSION} \ cuml-cu12~=${RAPIDS_VERSION} \ cuvs-cu12~=${RAPIDS_VERSION} \ + pylibraft-cu12~=${RAPIDS_VERSION} \ + raft-dask-cu12~=${RAPIDS_VERSION} \ + dask-cuda-cu12~=${RAPIDS_VERSION} \ --extra-index-url=https://pypi.nvidia.com --verbose sudo /usr/local/bin/pip3.10 install spark-rapids-ml sudo /usr/local/bin/pip3.10 list diff --git a/notebooks/databricks/init-pip-cuda-12.0.sh b/notebooks/databricks/init-pip-cuda-12.0.sh index e67ff3e88..91c38be8e 100644 --- a/notebooks/databricks/init-pip-cuda-12.0.sh +++ b/notebooks/databricks/init-pip-cuda-12.0.sh @@ -18,7 +18,7 @@ set -ex # IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0) # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2) -RAPIDS_VERSION=25.10.0 +RAPIDS_VERSION=25.12.0 SPARK_RAPIDS_VERSION=25.08.0 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda12.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar @@ -36,9 +36,13 @@ ln -s /usr/local/cuda-12.0 /usr/local/cuda # install cudf, cuml and their rapids dependencies # using ~= pulls in latest micro version patches -/databricks/python/bin/pip install cudf-cu12~=${RAPIDS_VERSION} \ +/databricks/python/bin/pip install --no-cache-dir \ + cudf-cu12~=${RAPIDS_VERSION} \ cuml-cu12~=${RAPIDS_VERSION} \ cuvs-cu12~=${RAPIDS_VERSION} \ + pylibraft-cu12~=${RAPIDS_VERSION} \ + raft-dask-cu12~=${RAPIDS_VERSION} \ + dask-cuda-cu12~=${RAPIDS_VERSION} \ --extra-index-url=https://pypi.nvidia.com # install spark-rapids-ml diff --git a/notebooks/dataproc/README.md b/notebooks/dataproc/README.md index b7b2c14b1..6a32c72bd 100644 --- a/notebooks/dataproc/README.md +++ b/notebooks/dataproc/README.md @@ -31,7 +31,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D If you wish to enable [no-import-change](../README.md#no-import-change) UX for the cluster, change the `spark-rapids-ml-no-import-enabled` metadata value to `1` in the command. The initialization script `spark_rapids_ml.sh` checks this metadata value and modifies the run time accordingly. ``` - export RAPIDS_VERSION=25.10.0 + export RAPIDS_VERSION=25.12.0 gcloud dataproc clusters create $USER-spark-rapids-ml \ --image-version=2.2-ubuntu22 \ diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh index 29b493029..6ab88749f 100644 --- a/notebooks/dataproc/spark_rapids_ml.sh +++ b/notebooks/dataproc/spark_rapids_ml.sh @@ -15,12 +15,18 @@ set -ex -RAPIDS_VERSION=25.10.0 +RAPIDS_VERSION=25.12.0 # install cudf and cuml pip install --upgrade pip -pip install cudf-cu12~=${RAPIDS_VERSION} cuml-cu12~=${RAPIDS_VERSION} cuvs-cu12~=${RAPIDS_VERSION} \ +pip install --no-cache-dir \ + cudf-cu12~=${RAPIDS_VERSION} \ + cuml-cu12~=${RAPIDS_VERSION} \ + cuvs-cu12~=${RAPIDS_VERSION} \ + pylibraft-cu12~=${RAPIDS_VERSION} \ + raft-dask-cu12~=${RAPIDS_VERSION} \ + dask-cuda-cu12~=${RAPIDS_VERSION} \ --extra-index-url=https://pypi.nvidia.com # install spark-rapids-ml diff --git a/notebooks/logistic-regression.ipynb b/notebooks/logistic-regression.ipynb index c14f362a0..72cceac52 100644 --- a/notebooks/logistic-regression.ipynb +++ b/notebooks/logistic-regression.ipynb @@ -739,9 +739,10 @@ "outputs": [], "source": [ "from spark_rapids_ml.classification import LogisticRegression as GPULR\n", + "from spark_rapids_ml.metrics.utils import logistic_regression_objective\n", "gpu_lr, gpu_model, gpu_fit_time, gpu_test_logLoss = sparse_vectors_compat(GPULR)\n", "print(f\"GPU fit took: {gpu_fit_time} sec\")\n", - "print(f\"GPU training objective: {gpu_model.objective}\")\n", + "print(f\"GPU training objective: {logistic_regression_objective(df_train, gpu_model)}\")\n", "print(f\"GPU test logLoss: {gpu_test_logLoss}\")" ] }, diff --git a/python/README.md b/python/README.md index e6f3f013c..e9cfc7968 100644 --- a/python/README.md +++ b/python/README.md @@ -20,9 +20,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 12.0: ```bash -conda create -n rapids-25.10 \ +conda create -n rapids-25.12 \ -c rapidsai -c conda-forge -c nvidia \ - cuml=25.10 cuvs=25.10 python=3.10 cuda-version=12.0 numpy~=1.0 + cuml=25.12 cuvs=25.12 python=3.10 cuda-version=12.0 numpy~=1.0 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. @@ -31,7 +31,7 @@ conda create -n rapids-25.10 \ Once you have the conda environment, activate it and install the required packages. ```bash -conda activate rapids-25.10 +conda activate rapids-25.12 ## for development access to notebooks, tests, and benchmarks git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git diff --git a/python/benchmark/databricks/cpu_cluster_spec.sh b/python/benchmark/databricks/cpu_cluster_spec.sh index 400f4056a..79997e97b 100644 --- a/python/benchmark/databricks/cpu_cluster_spec.sh +++ b/python/benchmark/databricks/cpu_cluster_spec.sh @@ -22,7 +22,7 @@ cat < Dict[str, Any]: logistic_regression = LogisticRegressionMG( handle=params[param_alias.handle], linesearch_max_iter=20, + penalty_normalized=False, + lbfgs_memory=10, **init_parameters, ) - logistic_regression.solver_model.penalty_normalized = False - logistic_regression.solver_model.lbfgs_memory = 10 - logistic_regression.solver_model.linesearch_max_iter = 20 - if is_sparse and pdesc.partition_max_nnz > nnz_limit_for_int32: # type: ignore logistic_regression._convert_index = np.int64 @@ -1121,8 +1119,7 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]: "classes_": logistic_regression.classes_.tolist(), "n_cols": n_cols, "dtype": logistic_regression.dtype.name, - "num_iters": logistic_regression.solver_model.num_iters, - "objective": logistic_regression.solver_model.objective, + "num_iters": logistic_regression.n_iter_[0], "index_dtype": index_dtype, } @@ -1199,7 +1196,6 @@ def _out_schema(self) -> Union[StructType, str]: StructField("n_cols", IntegerType(), False), StructField("dtype", StringType(), False), StructField("num_iters", IntegerType(), False), - StructField("objective", DoubleType(), False), StructField("index_dtype", StringType(), False), ] ) @@ -1305,7 +1301,6 @@ def __init__( n_cols: int, dtype: str, num_iters: int, - objective: float, ) -> None: super().__init__( dtype=dtype, @@ -1314,7 +1309,6 @@ def __init__( intercept_=intercept_, classes_=classes_, num_iters=num_iters, - objective=objective, ) self.coef_ = coef_ self.intercept_ = intercept_ @@ -1322,7 +1316,6 @@ def __init__( self._lr_spark_model: Optional[SparkLogisticRegressionModel] = None self._num_classes = len(self.classes_) self.num_iters = num_iters - self.objective = objective self._this_model = self def cpu(self) -> SparkLogisticRegressionModel: @@ -1469,9 +1462,10 @@ def _predict_labels(scores: "cp.ndarray", _num_classes: int) -> "cp.ndarray": def _construct_lr() -> CumlT: import cupy as cp import numpy as np - from cuml.internals.input_utils import input_to_cuml_array from cuml.linear_model.logistic_regression_mg import LogisticRegressionMG + from .utils import cudf_to_cuml_array + _intercepts, _coefs = ( (intercept_, coef_) if num_models > 1 else ([intercept_], [coef_]) ) @@ -1479,26 +1473,18 @@ def _construct_lr() -> CumlT: for i in range(num_models): lr = LogisticRegressionMG(output_type="cupy") - # need this to revert a change in cuML targeting sklearn compat. - lr.n_features_in_ = None + + lr.n_features_in_ = n_cols lr.n_cols = n_cols lr.dtype = np.dtype(dtype) gpu_intercept_ = cp.array(_intercepts[i], order="C", dtype=dtype) + gpu_coef_ = cp.array(_coefs[i], order="F", dtype=dtype) - gpu_coef_ = cp.array(_coefs[i], order="F", dtype=dtype).T - gpu_stacked = cp.vstack([gpu_coef_, gpu_intercept_]) - lr.solver_model._coef_ = input_to_cuml_array( - gpu_stacked, order="C" - ).array - - lr.classes_ = input_to_cuml_array( - np.array(classes_, order="F").astype(dtype) - ).array.to_output(output_type="numpy") - lr._num_classes = len(lr.classes_) + lr.classes_ = np.array(classes_, order="F").astype(dtype) + lr.coef_ = cudf_to_cuml_array(gpu_coef_, order="F") + lr.intercept_ = cudf_to_cuml_array(gpu_intercept_, order="C") - lr.loss = "sigmoid" if lr._num_classes <= 2 else "softmax" - lr.solver_model.qnparams = lr.create_qnparams() lrs.append(lr) return lrs diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 195662785..c09a7448f 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -362,7 +362,7 @@ def _cuml_fit( params: Dict[str, Any], ) -> Dict[str, Any]: import cupy as cp - from cuml.cluster.kmeans import KMeans as CumlKMeans + from cuml.cluster.kmeans_mg import KMeansMG as CumlKMeans kmeans_object = CumlKMeans( handle=params[param_alias.handle], @@ -393,10 +393,9 @@ def _cuml_fit( cuda_system_mem_headroom, ) - kmeans_object._fit( + kmeans_object.fit( concated, sample_weight=None, - multigpu=True, ) logger = get_logger(cls) @@ -506,15 +505,16 @@ def _get_cuml_transform_func( array_order = self._transform_array_order() def _construct_kmeans() -> CumlT: + import cupy as cp from cuml.cluster.kmeans import KMeans as CumlKMeans + from .utils import cudf_to_cuml_array kmeans = CumlKMeans(output_type="cupy", **cuml_alg_params) - from spark_rapids_ml.utils import cudf_to_cuml_array kmeans.n_features_in_ = n_cols kmeans.dtype = np.dtype(dtype) kmeans.cluster_centers_ = cudf_to_cuml_array( - np.array(cluster_centers_).astype(dtype), order=array_order + cp.array(cluster_centers_, dtype=dtype, order=array_order), order=array_order ) return kmeans diff --git a/python/src/spark_rapids_ml/connect_plugin.py b/python/src/spark_rapids_ml/connect_plugin.py index 5e1d4fc34..55bfe83eb 100644 --- a/python/src/spark_rapids_ml/connect_plugin.py +++ b/python/src/spark_rapids_ml/connect_plugin.py @@ -135,7 +135,6 @@ def transform(MODEL_TYPE: type) -> DataFrame: lr_model.n_cols, lr_model.dtype, lr_model.num_iters, - lr_model.objective, ] write_with_length(json.dumps(attributes).encode("utf-8"), outfile) diff --git a/python/src/spark_rapids_ml/feature.py b/python/src/spark_rapids_ml/feature.py index 70950ff83..595a9e854 100644 --- a/python/src/spark_rapids_ml/feature.py +++ b/python/src/spark_rapids_ml/feature.py @@ -263,7 +263,7 @@ def _cuml_fit( "explained_variance_ratio_": [cpu_explained_variance], "singular_values_": [cpu_singular_values], "n_cols": params[param_alias.num_cols], - "dtype": pca_object.dtype.name, + "dtype": pca_object.components_.dtype.name, } return _cuml_fit @@ -398,7 +398,7 @@ def _get_cuml_transform_func( cuml_alg_params = self.cuml_params.copy() n_cols = self.n_cols - dype = self.dtype + dtype = self.dtype components = self.components_ mean = self.mean_ singular_values = self.singular_values_ @@ -414,8 +414,7 @@ def _construct_pca() -> CumlT: pca = CumlPCAMG(output_type="numpy", **cuml_alg_params) - # need this to revert a change in cuML targeting sklearn compat. - pca.n_features_in_ = None + pca.n_features_in_ = n_cols # Compatible with older cuml versions (before 23.02) pca._n_components = pca.n_components @@ -424,7 +423,7 @@ def _construct_pca() -> CumlT: from spark_rapids_ml.utils import cudf_to_cuml_array pca.n_cols = n_cols - pca.dtype = np.dtype(dype) + pca.dtype = np.dtype(dtype) # TBD: figure out why PCA warns regardless of array order here and for singular values pca.components_ = cudf_to_cuml_array( @@ -437,8 +436,8 @@ def _construct_pca() -> CumlT: return pca transformed_mean = np.matmul( - np.array(self.mean_, self.dtype), - np.array(self.components_, self.dtype).T, + np.array(mean, dtype), + np.array(components, dtype).T, ) def _transform_internal( diff --git a/python/src/spark_rapids_ml/metrics/utils.py b/python/src/spark_rapids_ml/metrics/utils.py new file mode 100644 index 000000000..765e7408d --- /dev/null +++ b/python/src/spark_rapids_ml/metrics/utils.py @@ -0,0 +1,68 @@ +from spark_rapids_ml.classification import LogisticRegressionModel +from pyspark.ml.classification import ( + LogisticRegressionModel as SparkLogisticRegressionModel, +) +from pyspark.sql import DataFrame +from typing import Union +import numpy as np +from pyspark.ml.feature import StandardScaler +from pyspark.ml.evaluation import MulticlassClassificationEvaluator + + +def logistic_regression_objective( + df: DataFrame, + lr_model: Union["LogisticRegressionModel", "SparkLogisticRegressionModel"], +) -> float: + """can be used in testing and examples to calculate the full objective of a logistic regression model using Spark MLlib + + Args: + df: DataFrame + lr_model: Union[LogisticRegressionModel, SparkLogisticRegressionModel] + + Returns: + Full objective of the logistic regression model + """ + if isinstance(lr_model, LogisticRegressionModel): + lr_model = lr_model.cpu() + + df_with_preds = lr_model.transform(df) + + prediction_col = lr_model.getPredictionCol() + probability_col = lr_model.getProbabilityCol() + label_name = lr_model.getLabelCol() + features_col = lr_model.getFeaturesCol() + + evaluator_train = ( + MulticlassClassificationEvaluator() + .setMetricName("logLoss") # type:ignore + .setPredictionCol(prediction_col) + .setProbabilityCol(probability_col) + .setLabelCol(label_name) + ) + + log_loss = evaluator_train.evaluate(df_with_preds) + coefficients = ( + np.array(lr_model.coefficients) + if lr_model.numClasses == 2 + else lr_model.coefficientMatrix.toArray() + ) + + # account for effects of standardization on the coefficients for regularization penalty + if lr_model.getStandardization() is True: + scaler = StandardScaler( + inputCol=features_col, + outputCol="scaledFeatures", + ) + scaler_model = scaler.fit(df) + stdev = np.array(scaler_model.std) + coefficients = coefficients * stdev + + coefs_l1 = np.sum(np.abs(coefficients)) + coefs_l2 = np.sum(coefficients**2) + + elasticnet_param = lr_model.getElasticNetParam() + full_objective = log_loss + lr_model.getRegParam() * ( + 0.5 * (1 - elasticnet_param) * coefs_l2 + elasticnet_param * coefs_l1 + ) + + return full_objective diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index 334192634..717b8d79e 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -79,7 +79,7 @@ _RandomForestEstimator, _RandomForestModel, ) -from .utils import PartitionDescriptor, _get_spark_session, cudf_to_cuml_array, java_uid +from .utils import PartitionDescriptor, _get_spark_session, java_uid if TYPE_CHECKING: import cupy as cp @@ -216,7 +216,7 @@ def _param_value_mapping( def _get_cuml_params_default(self) -> Dict[str, Any]: return { - "algorithm": "eig", + "algorithm": "auto", "fit_intercept": True, "copy_X": True, "normalize": False, @@ -787,6 +787,8 @@ def _get_cuml_transform_func( def _construct_lr() -> CumlT: from cuml.linear_model.linear_regression_mg import LinearRegressionMG + from .utils import cudf_to_cuml_array + lrs = [] coefs = coef_ if isinstance(intercept_, list) else [coef_] @@ -795,7 +797,8 @@ def _construct_lr() -> CumlT: for i in range(len(coefs)): lr = LinearRegressionMG(output_type="numpy", copy_X=False) # need this to revert a change in cuML targeting sklearn compat. - lr.n_features_in_ = None + lr.n_features_in_ = n_cols + lr.n_cols = n_cols lr.coef_ = cudf_to_cuml_array( np.array(coefs[i], order="F").astype(dtype) ) diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index b267e41f9..bb418ce7f 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -697,7 +697,7 @@ def _construct_rf() -> CumlT: model = pickle.loads(base64.b64decode(m)) rf = cuRf() rf.n_classes_ = num_classes - rf.classes_ = cp.arange(num_classes, dtype=np.int32) + rf.classes_ = np.arange(num_classes, dtype=np.int32) rf._treelite_model_bytes = treelite.Model.deserialize_bytes(model) rfs.append(rf) diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index 6b9cdb5a8..3febc9cdc 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1396,6 +1396,11 @@ def _get_cuml_transform_func( cuml_alg_params = self.cuml_params sparse_fit = self._sparse_fit n_cols = self.n_cols + n_neighbors = self.getNNeighbors() + a = self.getA() + b = self.getB() + spread = self.getSpread() + min_dist = self.getMinDist() def _chunk_and_broadcast( sc: pyspark.SparkContext, @@ -1504,9 +1509,19 @@ def _construct_umap() -> CumlT: internal_model = CumlUMAP(**cuml_alg_params) internal_model.n_features_in_ = raw_data_cuml.shape[1] - internal_model.embedding_ = cp.array(embedding).data + internal_model.embedding_ = cudf_to_cuml_array(cp.array(embedding)) internal_model._raw_data = raw_data_cuml - internal_model.sparse_fit = sparse_fit + internal_model._sparse_data = sparse_fit + internal_model._n_neighbors = min(raw_data_cuml.shape[0], n_neighbors) + + if a is None or b is None: + #import pyximport + #pyximport.install() + from cuml.manifold.umap import find_ab_params + internal_model._a, internal_model._b = find_ab_params(spread, min_dist) + else: + internal_model._a = a + internal_model._b = b return internal_model diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index 7d0434802..ece582c33 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -186,9 +186,9 @@ def _configure_memory_resource( global _last_sam_headroom_size - _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( + _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( # type: ignore runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess, - rmm._cuda.gpu.getDevice(), + rmm._cuda.gpu.getDevice(), # type: ignore ) if not _SYSTEM_MEMORY_SUPPORTED and sam_enabled: diff --git a/python/tests/test_linear_model.py b/python/tests/test_linear_model.py index 9da550873..eddfbfd61 100644 --- a/python/tests/test_linear_model.py +++ b/python/tests/test_linear_model.py @@ -170,7 +170,7 @@ def test_linear_regression_params( "tol": 1e-06, } default_cuml_params = { - "algorithm": "eig", + "algorithm": "auto", "alpha": 0.0, "fit_intercept": True, "l1_ratio": 0.0, diff --git a/python/tests/test_logistic_regression.py b/python/tests/test_logistic_regression.py index ab84fffda..11d624de0 100644 --- a/python/tests/test_logistic_regression.py +++ b/python/tests/test_logistic_regression.py @@ -48,6 +48,8 @@ from pyspark.sql.functions import array, col, sum, udf from pyspark.sql.types import FloatType, LongType +from spark_rapids_ml.metrics.utils import logistic_regression_objective + if version.parse(cuml.__version__) < version.parse("23.08.00"): raise ValueError( "Logistic Regression requires cuml 23.08.00 or above. Try upgrading cuml or ignoring this file in testing" @@ -201,6 +203,8 @@ def test_params(tmp_path: str, caplog: LogCaptureFixture) -> None: cuml_classes=[CumlLogisticRegression], excludes=[ "class_weight", + "penalty_normalized", + "lbfgs_memory", "linesearch_max_iter", "solver", "handle", @@ -438,10 +442,15 @@ def _func_test_classifier( reg_param=reg_param, elasticNet_param=elasticNet_param ) - cu_lr = cuLR(fit_intercept=fit_intercept, penalty=penalty, C=C, l1_ratio=l1_ratio) - cu_lr.solver_model.penalty_normalized = False - cu_lr.solver_model.lbfgs_memory = 10 - cu_lr.solver_model.linesearch_max_iter = 20 + cu_lr = cuLR( + fit_intercept=fit_intercept, + penalty=penalty, + C=C, + l1_ratio=l1_ratio, + penalty_normalized=False, + lbfgs_memory=10, + linesearch_max_iter=20, + ) cu_lr.fit(X_train, y_train) spark_conf.update( @@ -484,7 +493,6 @@ def to_sparse_func(v: Union[SparseVector, DenseVector]) -> SparseVector: assert spark_lr._cuml_params["l1_ratio"] == cu_lr.l1_ratio else: assert spark_lr._cuml_params["l1_ratio"] == spark_lr.getElasticNetParam() - assert cu_lr.l1_ratio == None spark_lr.setFeaturesCol(features_col) spark_lr.setLabelCol(label_col) @@ -1223,27 +1231,6 @@ def test_quick( assert lr._cuml_params["C"] == C assert lr._cuml_params["l1_ratio"] == l1_ratio - from cuml import LogisticRegression as CUMLSG - - sg = CUMLSG(penalty=penalty, C=C, l1_ratio=l1_ratio) - l1_strength, l2_strength = sg._get_qn_params() - if reg_param == 0.0: - assert penalty == None - assert l1_strength == 0.0 - assert l2_strength == 0.0 - elif elasticNet_param == 0.0: - assert penalty == "l2" - assert l1_strength == 0.0 - assert l2_strength == reg_param - elif elasticNet_param == 1.0: - assert penalty == "l1" - assert l1_strength == reg_param - assert l2_strength == 0.0 - else: - assert penalty == "elasticnet" - assert l1_strength == reg_param * elasticNet_param - assert l2_strength == reg_param * (1 - elasticNet_param) - @pytest.mark.parametrize("metric_name", ["accuracy", "logLoss", "areaUnderROC"]) @pytest.mark.parametrize("feature_type", [feature_types.vector]) @@ -1802,9 +1789,11 @@ def test_sparse_nlp20news( cpu_model = cpu_lr.fit(df_train) cpu_objective = cpu_model.summary.objectiveHistory[-1] + gpu_model_objective = logistic_regression_objective(df_train, gpu_model) + assert ( - gpu_model.objective < cpu_objective - or abs(gpu_model.objective - cpu_objective) < tolerance + gpu_model_objective < cpu_objective + or abs(gpu_model_objective - cpu_objective) < tolerance ) if standardization is True: @@ -2329,9 +2318,10 @@ def test_sparse_int64() -> None: cpu_est = SparkLogisticRegression(**est_params) cpu_model = cpu_est.fit(df) cpu_objective = cpu_model.summary.objectiveHistory[-1] + gpu_model_objective = logistic_regression_objective(df, gpu_model) assert ( - gpu_model.objective < cpu_objective - or abs(gpu_model.objective - cpu_objective) < tolerance + gpu_model_objective < cpu_objective + or abs(gpu_model_objective - cpu_objective) < tolerance ) df_test = df.sample(fraction=fraction_sampled_for_test, seed=0) diff --git a/python/tests/test_umap.py b/python/tests/test_umap.py index 9511da30b..cc447d992 100644 --- a/python/tests/test_umap.py +++ b/python/tests/test_umap.py @@ -649,7 +649,7 @@ def test_umap_sample_fraction(gpu_number: int) -> None: def test_umap_build_algo(gpu_number: int, metric: str) -> None: n_rows = 10000 - # cuml 25.10 UMAP is unstable for low dimensions + # cuml 25.12 UMAP is unstable for low dimensions n_cols = 100 random_state = 42 diff --git a/python/tests_large/test_large_logistic_regression.py b/python/tests_large/test_large_logistic_regression.py index c3190c3a3..f68781ad8 100644 --- a/python/tests_large/test_large_logistic_regression.py +++ b/python/tests_large/test_large_logistic_regression.py @@ -31,6 +31,7 @@ ) from spark_rapids_ml.classification import LogisticRegression, LogisticRegressionModel +from spark_rapids_ml.metrics.utils import logistic_regression_objective from tests.test_logistic_regression import compare_model from .conftest import _spark @@ -52,9 +53,10 @@ def _compare_with_cpu_estimator( df_test = df.sample(fraction=fraction_sampled_for_test, seed=0) + gpu_model_objective = logistic_regression_objective(df, gpu_model) assert ( - gpu_model.objective < cpu_objective - or abs(gpu_model.objective - cpu_objective) < tolerance + gpu_model_objective < cpu_objective + or abs(gpu_model_objective - cpu_objective) < tolerance ) compare_model( From 7e712feed47f6d7c11c616d44f4a2d523437c84d Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Tue, 2 Dec 2025 15:40:15 -0800 Subject: [PATCH 2/9] use order internally expected by cuml for setting embedding_ field Signed-off-by: Erik Ordentlich --- python/src/spark_rapids_ml/umap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index 3febc9cdc..41d8e0a12 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1509,7 +1509,7 @@ def _construct_umap() -> CumlT: internal_model = CumlUMAP(**cuml_alg_params) internal_model.n_features_in_ = raw_data_cuml.shape[1] - internal_model.embedding_ = cudf_to_cuml_array(cp.array(embedding)) + internal_model.embedding_ = cudf_to_cuml_array(cp.array(embedding, order="C"), order="C") internal_model._raw_data = raw_data_cuml internal_model._sparse_data = sparse_fit internal_model._n_neighbors = min(raw_data_cuml.shape[0], n_neighbors) From c8718249d5cdfb0404d790679ffc215078ceefb4 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Tue, 2 Dec 2025 15:44:17 -0800 Subject: [PATCH 3/9] formatting Signed-off-by: Erik Ordentlich --- python/src/spark_rapids_ml/clustering.py | 4 +++- python/src/spark_rapids_ml/metrics/utils.py | 12 +++++++----- python/src/spark_rapids_ml/umap.py | 11 +++++++---- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index c09a7448f..c768c62b5 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -507,6 +507,7 @@ def _get_cuml_transform_func( def _construct_kmeans() -> CumlT: import cupy as cp from cuml.cluster.kmeans import KMeans as CumlKMeans + from .utils import cudf_to_cuml_array kmeans = CumlKMeans(output_type="cupy", **cuml_alg_params) @@ -514,7 +515,8 @@ def _construct_kmeans() -> CumlT: kmeans.n_features_in_ = n_cols kmeans.dtype = np.dtype(dtype) kmeans.cluster_centers_ = cudf_to_cuml_array( - cp.array(cluster_centers_, dtype=dtype, order=array_order), order=array_order + cp.array(cluster_centers_, dtype=dtype, order=array_order), + order=array_order, ) return kmeans diff --git a/python/src/spark_rapids_ml/metrics/utils.py b/python/src/spark_rapids_ml/metrics/utils.py index 765e7408d..4ed655c00 100644 --- a/python/src/spark_rapids_ml/metrics/utils.py +++ b/python/src/spark_rapids_ml/metrics/utils.py @@ -1,12 +1,14 @@ -from spark_rapids_ml.classification import LogisticRegressionModel +from typing import Union + +import numpy as np from pyspark.ml.classification import ( LogisticRegressionModel as SparkLogisticRegressionModel, ) -from pyspark.sql import DataFrame -from typing import Union -import numpy as np -from pyspark.ml.feature import StandardScaler from pyspark.ml.evaluation import MulticlassClassificationEvaluator +from pyspark.ml.feature import StandardScaler +from pyspark.sql import DataFrame + +from spark_rapids_ml.classification import LogisticRegressionModel def logistic_regression_objective( diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index 41d8e0a12..1fbd38163 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1509,15 +1509,18 @@ def _construct_umap() -> CumlT: internal_model = CumlUMAP(**cuml_alg_params) internal_model.n_features_in_ = raw_data_cuml.shape[1] - internal_model.embedding_ = cudf_to_cuml_array(cp.array(embedding, order="C"), order="C") + internal_model.embedding_ = cudf_to_cuml_array( + cp.array(embedding, order="C"), order="C" + ) internal_model._raw_data = raw_data_cuml internal_model._sparse_data = sparse_fit internal_model._n_neighbors = min(raw_data_cuml.shape[0], n_neighbors) - + if a is None or b is None: - #import pyximport - #pyximport.install() + # import pyximport + # pyximport.install() from cuml.manifold.umap import find_ab_params + internal_model._a, internal_model._b = find_ab_params(spread, min_dist) else: internal_model._a = a From 76db03a27a199f943126a987d80249c1aaf9333a Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Mon, 8 Dec 2025 15:57:57 -0800 Subject: [PATCH 4/9] cleanup, fixes for DB 13.3, 14.3 scipy issue, fix one label value compat tests Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- docker/Dockerfile.python | 2 +- python/README.md | 2 +- .../benchmark/databricks/gpu_cluster_spec.sh | 3 + .../databricks/init-pip-cuda-12.0.sh | 4 + python/src/spark_rapids_ml/classification.py | 99 +++++++++++-------- python/src/spark_rapids_ml/umap.py | 2 - python/tests/test_logistic_regression.py | 39 +++----- 8 files changed, 80 insertions(+), 73 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index acf57cb8b..771e9e319 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -48,5 +48,5 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 # install cuML ARG RAPIDS_VERSION=25.12 -RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-cudart cuda-version=12.0 numpy~=1.0 \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.0 numpy~=1.0 \ && conda clean --all -f -y diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index 9d3664510..1e8141d06 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -47,7 +47,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu # install cuML -RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.0 cuml=$RAPIDS_VERSION cudf=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-cudart numpy~=1.0 \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.0 cuml=$RAPIDS_VERSION cudf=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION numpy~=1.0 \ && conda clean --all -f -y # install python dependencies diff --git a/python/README.md b/python/README.md index e9cfc7968..8ae6ae114 100644 --- a/python/README.md +++ b/python/README.md @@ -22,7 +22,7 @@ First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html ```bash conda create -n rapids-25.12 \ -c rapidsai -c conda-forge -c nvidia \ - cuml=25.12 cuvs=25.12 python=3.10 cuda-version=12.0 numpy~=1.0 + cuml=25.12 cuvs=25.12 python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.0 numpy~=1.0 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. diff --git a/python/benchmark/databricks/gpu_cluster_spec.sh b/python/benchmark/databricks/gpu_cluster_spec.sh index 81098945e..5eafc82b6 100644 --- a/python/benchmark/databricks/gpu_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_cluster_spec.sh @@ -35,6 +35,9 @@ cat < Dict[str, Any]: cuda_system_mem_headroom, ) - logistic_regression.fit( - [(concated, concated_y)], - pdesc.m, - pdesc.n, - pdesc.parts_rank_size, - pdesc.rank, - ) + try: + logistic_regression.fit( + [(concated, concated_y)], + pdesc.m, + pdesc.n, + pdesc.parts_rank_size, + pdesc.rank, + ) + except ValueError as e: + # cuML now raises an exception if only one label value is observed. + # Here we suppress in that case until later as we can handle it when + # fitIntercept=True. + import traceback + + exc_str = traceback.format_exc() + if not "requires n_classes == 2 (got 1)" in exc_str: + raise + + # check if invalid label exists. Do this first before handling 1 label value to match apache spark. + for class_val in logistic_regression.classes_.tolist(): + if class_val < 0: + raise RuntimeError( + f"Labels MUST be in [0, 2147483647), but got {class_val}" + ) + elif not class_val.is_integer(): + raise RuntimeError( + f"Labels MUST be Integers, but got {class_val}" + ) + + n_cols = logistic_regression.n_cols - coef_ = logistic_regression.coef_ - intercept_ = logistic_regression.intercept_ + if len(logistic_regression.classes_) == 1: + class_val = logistic_regression.classes_[0] + # TODO: match Spark to use max(class_list) to calculate the number of classes + # Cuml currently uses unique(class_list) + if class_val != 1.0 and class_val != 0.0: + raise RuntimeError( + "class value must be either 1. or 0. when dataset has one label" + ) + + import cupy as cp + + coef_ = cp.zeros(n_cols) + intercept_ = cp.array( + [float("inf") if class_val == 1.0 else float("-inf")] + ) + n_iter_ = 0 + else: + coef_ = logistic_regression.coef_ + intercept_ = logistic_regression.intercept_ + n_iter_ = logistic_regression.n_iter_[0] if standarization_with_cupy is True: import cupy as cp @@ -1104,8 +1145,6 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]: ) intercept_array -= intercept_mean - n_cols = logistic_regression.n_cols - # index_dtype is only available in sparse logistic regression. It records the dtype of indices array and indptr array that were used in C++ computation layer. Its value can be 'int32' or 'int64'. index_dtype = ( str(logistic_regression.index_dtype) @@ -1114,41 +1153,19 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]: ) model = { - "coef_": coef_[:, :n_cols].tolist(), + "coef_": ( + coef_[:, :n_cols].tolist() + if coef_.ndim == 2 + else [coef_[:n_cols].tolist()] + ), "intercept_": intercept_.tolist(), "classes_": logistic_regression.classes_.tolist(), "n_cols": n_cols, "dtype": logistic_regression.dtype.name, - "num_iters": logistic_regression.n_iter_[0], + "num_iters": n_iter_, "index_dtype": index_dtype, } - # check if invalid label exists - for class_val in model["classes_"]: - if class_val < 0: - raise RuntimeError( - f"Labels MUST be in [0, 2147483647), but got {class_val}" - ) - elif not class_val.is_integer(): - raise RuntimeError( - f"Labels MUST be Integers, but got {class_val}" - ) - - if len(logistic_regression.classes_) == 1: - class_val = logistic_regression.classes_[0] - # TODO: match Spark to use max(class_list) to calculate the number of classes - # Cuml currently uses unique(class_list) - if class_val != 1.0 and class_val != 0.0: - raise RuntimeError( - "class value must be either 1. or 0. when dataset has one label" - ) - - if init_parameters["fit_intercept"] is True: - model["coef_"] = [[0.0] * n_cols] - model["intercept_"] = [ - float("inf") if class_val == 1.0 else float("-inf") - ] - del logistic_regression return model @@ -1204,8 +1221,8 @@ def _create_pyspark_model(self, result: Row) -> "LogisticRegressionModel": logger = get_logger(self.__class__) if len(result["classes_"]) == 1: if self.getFitIntercept() is False: - logger.warning( - "All labels belong to a single class and fitIntercept=false. It's a dangerous ground, so the algorithm may not converge." + raise ValueError( + "All labels belong to a single class and fitIntercept=false. This is not supported. Please use fitIntercept=true." ) else: logger.warning( diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index 1fbd38163..9ab47727d 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1517,8 +1517,6 @@ def _construct_umap() -> CumlT: internal_model._n_neighbors = min(raw_data_cuml.shape[0], n_neighbors) if a is None or b is None: - # import pyximport - # pyximport.install() from cuml.manifold.umap import find_ab_params internal_model._a, internal_model._b = find_ab_params(spread, min_dist) diff --git a/python/tests/test_logistic_regression.py b/python/tests/test_logistic_regression.py index 11d624de0..c5fecbcdb 100644 --- a/python/tests/test_logistic_regression.py +++ b/python/tests/test_logistic_regression.py @@ -1440,32 +1440,8 @@ def test_compat_one_label( assert label == 1.0 or label == 0.0 - blor_model = blor.fit(bdf) - - if fit_intercept is False: - if _LogisticRegression is SparkLogisticRegression: - # Got empty caplog.text. Spark prints warning message from jvm - assert caplog.text == "" - else: - assert ( - "All labels belong to a single class and fitIntercept=false. It's a dangerous ground, so the algorithm may not converge." - in caplog.text - ) - - if label == 1.0: - assert array_equal( - blor_model.coefficients.toArray(), - [0.85431526, 0.85431526], - tolerance, - ) - else: - assert array_equal( - blor_model.coefficients.toArray(), - [-0.85431526, -0.85431526], - tolerance, - ) - assert blor_model.intercept == 0.0 - else: + if fit_intercept is True: + blor_model = blor.fit(bdf) if _LogisticRegression is SparkLogisticRegression: # Got empty caplog.text. Spark prints warning message from jvm assert caplog.text == "" @@ -1474,11 +1450,20 @@ def test_compat_one_label( "All labels are the same value and fitIntercept=true, so the coefficients will be zeros. Training is not needed." in caplog.text ) - assert array_equal(blor_model.coefficients.toArray(), [0, 0], 0.0) assert blor_model.intercept == ( float("inf") if label == 1.0 else float("-inf") ) + else: + if _LogisticRegression is SparkLogisticRegression: + blor_model = blor.fit(bdf) + assert caplog.text == "" + else: + with pytest.raises( + ValueError, + match="All labels belong to a single class and fitIntercept=false. This is not supported. Please use fitIntercept=true.", + ): + blor_model = blor.fit(bdf) @pytest.mark.compat From 991c653af5bfd491f832a0a7becef09a778af5be Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Mon, 8 Dec 2025 18:39:06 -0800 Subject: [PATCH 5/9] cleanup Signed-off-by: Erik Ordentlich --- python/benchmark/databricks/setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/benchmark/databricks/setup.sh b/python/benchmark/databricks/setup.sh index b0fc28631..5cc0da38e 100755 --- a/python/benchmark/databricks/setup.sh +++ b/python/benchmark/databricks/setup.sh @@ -60,7 +60,7 @@ popd # create workspace directory databricks workspace mkdirs ${INIT_SCRIPT_DIR} --profile ${DB_PROFILE} ${DB_OVERWRITE} # point cpu and gpu cluster init scripts to new files and upload -for init_script in init-pip-cuda-12.0-nightly.sh init-cpu.sh init-pip-cuda-13.0-nightly.sh +for init_script in init-pip-cuda-12.0.sh init-cpu.sh do # NOTE: on linux delete the .bu after -i if base64 --help | grep '\-w'; then From 477cc3b5cd292fdac67a04b15ecc2a0c52ba271d Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Mon, 8 Dec 2025 20:15:12 -0800 Subject: [PATCH 6/9] fix ci rapidsai-nightly conda channel, remove nightly from 25.12 databricks scripts Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- notebooks/databricks/init-pip-cuda-12.0.sh | 1 + python/benchmark/databricks/gpu_cluster_spec.sh | 2 +- python/benchmark/databricks/gpu_etl_cluster_spec.sh | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index 771e9e319..54e9d6aeb 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -48,5 +48,5 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 # install cuML ARG RAPIDS_VERSION=25.12 -RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.0 numpy~=1.0 \ +RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.0 numpy~=1.0 \ && conda clean --all -f -y diff --git a/notebooks/databricks/init-pip-cuda-12.0.sh b/notebooks/databricks/init-pip-cuda-12.0.sh index 91c38be8e..0bb76dfe0 100644 --- a/notebooks/databricks/init-pip-cuda-12.0.sh +++ b/notebooks/databricks/init-pip-cuda-12.0.sh @@ -43,6 +43,7 @@ ln -s /usr/local/cuda-12.0 /usr/local/cuda pylibraft-cu12~=${RAPIDS_VERSION} \ raft-dask-cu12~=${RAPIDS_VERSION} \ dask-cuda-cu12~=${RAPIDS_VERSION} \ + numpy~=1.0 \ --extra-index-url=https://pypi.nvidia.com # install spark-rapids-ml diff --git a/python/benchmark/databricks/gpu_cluster_spec.sh b/python/benchmark/databricks/gpu_cluster_spec.sh index 5eafc82b6..7dc1fc77e 100644 --- a/python/benchmark/databricks/gpu_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_cluster_spec.sh @@ -58,7 +58,7 @@ cat < Date: Mon, 8 Dec 2025 20:56:47 -0800 Subject: [PATCH 7/9] updates for cuda 12.2 as new minimum cuda 12 in rapids, delete old dockerfile Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 4 +- docker/Dockerfile | 115 ------------------ docker/Dockerfile.pip | 2 +- docker/Dockerfile.python | 4 +- notebooks/databricks/README.md | 4 +- ...t-pip-cuda-12.0.sh => init-pip-cuda-12.sh} | 8 +- .../benchmark/databricks/gpu_cluster_spec.sh | 2 +- .../databricks/gpu_etl_cluster_spec.sh | 2 +- ...t-pip-cuda-12.0.sh => init-pip-cuda-12.sh} | 12 +- python/benchmark/databricks/setup.sh | 2 +- 10 files changed, 20 insertions(+), 135 deletions(-) delete mode 100644 docker/Dockerfile rename notebooks/databricks/{init-pip-cuda-12.0.sh => init-pip-cuda-12.sh} (89%) rename python/benchmark/databricks/{init-pip-cuda-12.0.sh => init-pip-cuda-12.sh} (85%) diff --git a/ci/Dockerfile b/ci/Dockerfile index 54e9d6aeb..f7b4a0d75 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -14,7 +14,7 @@ # limitations under the License. # -ARG CUDA_VERSION=12.0.1 +ARG CUDA_VERSION=12.2.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 # ubuntu22 @@ -48,5 +48,5 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 # install cuML ARG RAPIDS_VERSION=25.12 -RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.0 numpy~=1.0 \ +RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 1c5f9d864..000000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,115 +0,0 @@ -# -# Copyright (c) 2023-2025, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -### -# -# The image to mvn build jars of GitHub/nvidia/spark-rapids-ml -# -# Arguments: CUDA_VERSION=11.5.x (CUDA Toolkit(>=11.5)) -# Arguments: GCC_VERSION=9 (gcc(>=9.3)) -# Arguments: CMAKE_VERSION=3.20.x (cmake(>=3.20)) -# Arguments: NINJA_VERSION=1.10.x (ninja(>=1.10)) -# Arguments: MAVEN_VERSION=3.8.6 (maven(>=3.3.9)) -# -# Example to build & upload image : -# docker build -t spark-rapids-ml:cuda11.5.2 . -f Dockerfile \ -# --build-arg CUDA_VERSION=11.5.2 \ -# --build-arg GCC_VERSION=9 \ -# --build-arg CMAKE_VERSION=3.23.3 \ -# --build-arg NINJA_VERSION=1.10.2 \ -# --build-arg MAVEN_VERSION=3.8.6 -# -### - -ARG CUDA_VERSION=12.0.1 -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 - -# ubuntu22 -RUN sed -i -e 's|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g' \ - -e 's|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g' \ - /etc/apt/sources.list -# ubuntu24+ -RUN find /etc/apt/sources.list.d/ -name '*.sources' -exec sed -i \ - -e "s|http://archive.ubuntu.com/ubuntu|https://archive.ubuntu.com/ubuntu|g" \ - -e "s|http://security.ubuntu.com/ubuntu|https://security.ubuntu.com/ubuntu|g" {} + -# Install packages to build spark-rapids-ml jars -RUN apt update -y && \ - DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt install -y openjdk-8-jdk wget git zip - -# Install gcc & g++ -ARG GCC_VERSION=9 -RUN apt install -y software-properties-common \ - && add-apt-repository -y ppa:git-core/ppa \ - && add-apt-repository -y ppa:ubuntu-toolchain-r/test \ - && add-apt-repository -y ppa:deadsnakes/ppa \ - && apt update -y \ - && apt install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} \ - && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${GCC_VERSION} 100 \ - && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${GCC_VERSION} 100 - -# Install cmake -ARG CMAKE_VERSION=3.23.3 -RUN cd /usr/local/ && wget -q https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ - tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \ - rm -rf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz -ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:$PATH - -# Install ninja -ARG NINJA_VERSION=1.10.2 -RUN cd /usr/local/ && wget -q https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip && \ - mkdir -p /usr/local/ninja && unzip -d /usr/local/ninja ninja-linux.zip && rm -rf ninja-linux.zip -ENV PATH /usr/local/ninja:$PATH - -# Config JAVA_HOME -ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 - -# Donwload maven -ARG MAVEN_VERSION=3.8.8 -RUN cd /usr/local/ && wget -q https://dlcdn.apache.org/maven/maven-3/$MAVEN_VERSION/binaries/apache-maven-$MAVEN_VERSION-bin.zip && \ - unzip apache-maven-$MAVEN_VERSION-bin.zip && rm -rf apache-maven-$MAVEN_VERSION-bin.zip -ENV PATH /usr/local/apache-maven-$MAVEN_VERSION/bin:$PATH - -# Install conda -ENV PATH="/root/miniconda3/bin:${PATH}" -ARG PATH="/root/miniconda3/bin:${PATH}" -RUN wget --quiet \ - https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh \ - && mkdir /root/.conda \ - && bash Miniconda3-py38_4.10.3-Linux-x86_64.sh -b \ - && rm -f Miniconda3-py38_4.10.3-Linux-x86_64.sh \ - && conda tos accept --override-channels -c conda-forge -c defaults \ - && conda init - -# install cuDF dependency, Fall back to use cudf 22.04 due to issue: -# https://github.com/NVIDIA/spark-rapids-ml/issues/73 -ARG CONDA_CUDF_VER=22.04 -RUN conda install -c rapidsai -c conda-forge cudf=$CONDA_CUDF_VER python=3.8 -y - -# Note: the raft verion is fixed to 22.12, do not modify it when updating the spark-rapids-ml version. -# newer versions may fail the build process due to API incompatibility. -ARG RAFT_VER=22.12 -RUN git clone -b branch-$RAFT_VER https://github.com/rapidsai/raft.git -ENV RAFT_PATH=/raft - -### END OF CACHE ### - -#ARG RAPIDS_ML_VER=23.04 -#RUN git clone -b branch-$RAPIDS_ML_VER https://github.com/NVIDIA/spark-rapids-ml.git -COPY . /spark-rapids-ml -WORKDIR /spark-rapids-ml/jvm - -SHELL ["conda", "run", "--no-capture-output", "-n", "base", "/bin/bash", "-c"] diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index 2ea468cd5..51b241ea5 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -14,7 +14,7 @@ # limitations under the License. # -ARG CUDA_VERSION=12.0.1 +ARG CUDA_VERSION=12.2.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 ARG PYSPARK_VERSION=3.3.1 diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python index 1e8141d06..0306459a9 100644 --- a/docker/Dockerfile.python +++ b/docker/Dockerfile.python @@ -14,7 +14,7 @@ # limitations under the License. # -ARG CUDA_VERSION=12.0.1 +ARG CUDA_VERSION=12.2.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 ARG RAPIDS_VERSION=25.12 @@ -47,7 +47,7 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linu # install cuML -RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.0 cuml=$RAPIDS_VERSION cudf=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION numpy~=1.0 \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia python=3.10 cuda-version=12.2 cuml=$RAPIDS_VERSION cudf=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION numpy~=1.0 \ && conda clean --all -f -y # install python dependencies diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md index 278a85775..6637961df 100644 --- a/notebooks/databricks/README.md +++ b/notebooks/databricks/README.md @@ -11,7 +11,7 @@ If you already have a Databricks account, you can run the example notebooks on a ```bash export WS_SAVE_DIR="/path/to/directory/in/workspace" databricks workspace mkdirs ${WS_SAVE_DIR} --profile ${PROFILE} - databricks workspace import --format AUTO --file init-pip-cuda-12.0.sh ${WS_SAVE_DIR}/init-pip-cuda-12.0.sh --profile ${PROFILE} + databricks workspace import --format AUTO --file init-pip-cuda-12.sh ${WS_SAVE_DIR}/init-pip-cuda-12.sh --profile ${PROFILE} ``` **Note**: the init script does the following on each Spark node: - updates the CUDA runtime to 12.0 (required for Spark Rapids ML dependencies). @@ -20,7 +20,7 @@ If you already have a Databricks account, you can run the example notebooks on a - if the cluster environment variable `SPARK_RAPIDS_ML_NO_IMPORT_ENABLED=1` is define (see below), the init script also modifies a Databricks notebook kernel startup script to enable no-import change UX for the cluster. See [no-import-change](../README.md#no-import-change). - Create a cluster using **Databricks 13.3 LTS ML GPU Runtime** using at least two single-gpu workers and add the following configurations to the **Advanced options**. - **Init Scripts** - - add the workspace path to the uploaded init script `${WS_SAVE_DIR}/init-pip-cuda-12.0.sh` as set above (but substitute variables manually in the form). + - add the workspace path to the uploaded init script `${WS_SAVE_DIR}/init-pip-cuda-12.sh` as set above (but substitute variables manually in the form). - **Spark** - **Spark config** ``` diff --git a/notebooks/databricks/init-pip-cuda-12.0.sh b/notebooks/databricks/init-pip-cuda-12.sh similarity index 89% rename from notebooks/databricks/init-pip-cuda-12.0.sh rename to notebooks/databricks/init-pip-cuda-12.sh index 0bb76dfe0..7f36b9278 100644 --- a/notebooks/databricks/init-pip-cuda-12.0.sh +++ b/notebooks/databricks/init-pip-cuda-12.sh @@ -23,13 +23,13 @@ SPARK_RAPIDS_VERSION=25.08.0 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda12.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar -# install cudatoolkit 12.0 via runfile approach -wget https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run -sh cuda_12.0.1_525.85.12_linux.run --silent --toolkit +# install cudatoolkit 12.2 via runfile approach +wget https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run +sh cuda_12.2.2_535.104.05_linux.run --silent --toolkit # reset symlink and update library loading paths rm /usr/local/cuda -ln -s /usr/local/cuda-12.0 /usr/local/cuda +ln -s /usr/local/cuda-12.2 /usr/local/cuda # upgrade pip /databricks/python/bin/pip install --upgrade pip diff --git a/python/benchmark/databricks/gpu_cluster_spec.sh b/python/benchmark/databricks/gpu_cluster_spec.sh index 7dc1fc77e..7578e351f 100644 --- a/python/benchmark/databricks/gpu_cluster_spec.sh +++ b/python/benchmark/databricks/gpu_cluster_spec.sh @@ -58,7 +58,7 @@ cat < Date: Thu, 11 Dec 2025 21:46:06 -0800 Subject: [PATCH 8/9] fix dbscan tests for 2+ gpus, cleanup, remove -nightly Signed-off-by: Erik Ordentlich --- ci/Dockerfile | 2 +- python/src/spark_rapids_ml/metrics/utils.py | 9 ++++--- python/tests/test_dbscan.py | 30 +++++++++------------ 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/ci/Dockerfile b/ci/Dockerfile index f7b4a0d75..13d119e92 100644 --- a/ci/Dockerfile +++ b/ci/Dockerfile @@ -48,5 +48,5 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 # install cuML ARG RAPIDS_VERSION=25.12 -RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ +RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$RAPIDS_VERSION cuvs=$RAPIDS_VERSION python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.2 numpy~=1.0 \ && conda clean --all -f -y diff --git a/python/src/spark_rapids_ml/metrics/utils.py b/python/src/spark_rapids_ml/metrics/utils.py index 4ed655c00..6b6d99ea6 100644 --- a/python/src/spark_rapids_ml/metrics/utils.py +++ b/python/src/spark_rapids_ml/metrics/utils.py @@ -22,7 +22,10 @@ def logistic_regression_objective( lr_model: Union[LogisticRegressionModel, SparkLogisticRegressionModel] Returns: - Full objective of the logistic regression model + Full objective of the logistic regression model: + log_loss + reg_param * (0.5 * (1 - elasticnet_param) * ||coefs||_2^2 + elasticnet_param * |coefs|_1) + where: + log_loss = (1/n) * sum_i(-log(prob(y_i))) for labels y_1, y_2, ..., y_n """ if isinstance(lr_model, LogisticRegressionModel): lr_model = lr_model.cpu() @@ -34,7 +37,7 @@ def logistic_regression_objective( label_name = lr_model.getLabelCol() features_col = lr_model.getFeaturesCol() - evaluator_train = ( + evaluator = ( MulticlassClassificationEvaluator() .setMetricName("logLoss") # type:ignore .setPredictionCol(prediction_col) @@ -42,7 +45,7 @@ def logistic_regression_objective( .setLabelCol(label_name) ) - log_loss = evaluator_train.evaluate(df_with_preds) + log_loss = evaluator.evaluate(df_with_preds) coefficients = ( np.array(lr_model.coefficients) if lr_model.numClasses == 2 diff --git a/python/tests/test_dbscan.py b/python/tests/test_dbscan.py index a5759b93d..ca6bc7862 100644 --- a/python/tests/test_dbscan.py +++ b/python/tests/test_dbscan.py @@ -129,18 +129,14 @@ def test_dbscan_basic( # reduce the number of GPUs for toy dataset to avoid empty partition gpu_number = min(gpu_number, 2) data = [ - ([0.0, 0.0]), - ([1.0, 1.0]), - ([9.0, 8.0]), - ([8.0, 9.0]), + (0, [0.0, 0.0]), + (1, [1.0, 1.0]), + (2, [9.0, 8.0]), + (3, [8.0, 9.0]), ] with CleanSparkSession() as spark: - df = ( - spark.sparkContext.parallelize(data, gpu_number) - .map(lambda row: (row,)) - .toDF(["features"]) - ) + df = spark.sparkContext.parallelize(data, gpu_number).toDF(["id", "features"]) dbscan = DBSCAN(num_workers=gpu_number, min_samples=2, eps=2).setFeaturesCol( "features" ) @@ -156,10 +152,10 @@ def test_dbscan_basic( # test transform function dbscan_model.setPredictionCol("prediction") label_df = dbscan_model.transform(df) - assert ["features", "prediction"] == sorted(label_df.columns) + assert ["features", "id", "prediction"] == sorted(label_df.columns) o_col = dbscan_model.getPredictionCol() - labels = [row[o_col] for row in label_df.collect()] + labels = [row[o_col] for row in label_df.sort("id").collect()] assert len(labels) == 4 assert labels[0] == labels[1] @@ -169,10 +165,10 @@ def test_dbscan_basic( # Test the loaded model dbscan_model_loaded.setPredictionCol("prediction") label_df = dbscan_model_loaded.transform(df) - assert ["features", "prediction"] == sorted(label_df.columns) + assert ["features", "id", "prediction"] == sorted(label_df.columns) o_col = dbscan_model_loaded.getPredictionCol() - labels = [row[o_col] for row in label_df.collect()] + labels = [row[o_col] for row in label_df.sort("id").collect()] assert len(labels) == 4 assert labels[0] == labels[1] @@ -279,11 +275,9 @@ def test_dbscan( transformed = dbscan_model.transform(df) # Check cluster match - label_df = transformed.select("prediction") - feature_df = transformed.drop("prediction") - - label_pdf = label_df.toPandas() - feature_pdf = feature_df.toPandas() + pandas_df = transformed.toPandas() + label_pdf = pandas_df["prediction"] + feature_pdf = pandas_df[features_col] label_arr = label_pdf.to_numpy().squeeze() feature_matrix = feature_pdf.to_numpy() From 6efd6017ae91f9da17eb758086170e6221da7ec3 Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Tue, 16 Dec 2025 11:59:15 -0800 Subject: [PATCH 9/9] address comments Signed-off-by: Erik Ordentlich --- docker/Dockerfile.pip | 1 - notebooks/aws-emr/init-bootstrap-action.sh | 1 - notebooks/databricks/README.md | 2 +- notebooks/databricks/init-pip-cuda-12.sh | 1 - notebooks/dataproc/spark_rapids_ml.sh | 1 - python/README.md | 4 ++-- python/benchmark/databricks/init-pip-cuda-12.sh | 1 - python/benchmark/dataproc/init_benchmark.sh | 1 - python/src/spark_rapids_ml/classification.py | 1 + python/src/spark_rapids_ml/regression.py | 5 ++++- 10 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip index 51b241ea5..ac9cd292d 100644 --- a/docker/Dockerfile.pip +++ b/docker/Dockerfile.pip @@ -52,7 +52,6 @@ RUN pip install --no-cache-dir \ cuvs-cu12~=${RAPIDS_VERSION} \ pylibraft-cu12~=${RAPIDS_VERSION} \ raft-dask-cu12~=${RAPIDS_VERSION} \ - dask-cuda-cu12~=${RAPIDS_VERSION} \ numpy~=1.0 \ --extra-index-url=https://pypi.nvidia.com diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh index a812e0434..f45deb065 100755 --- a/notebooks/aws-emr/init-bootstrap-action.sh +++ b/notebooks/aws-emr/init-bootstrap-action.sh @@ -41,7 +41,6 @@ sudo /usr/local/bin/pip3.10 install --no-cache-dir \ cuvs-cu12~=${RAPIDS_VERSION} \ pylibraft-cu12~=${RAPIDS_VERSION} \ raft-dask-cu12~=${RAPIDS_VERSION} \ - dask-cuda-cu12~=${RAPIDS_VERSION} \ --extra-index-url=https://pypi.nvidia.com --verbose sudo /usr/local/bin/pip3.10 install spark-rapids-ml sudo /usr/local/bin/pip3.10 list diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md index 6637961df..d6b1bb312 100644 --- a/notebooks/databricks/README.md +++ b/notebooks/databricks/README.md @@ -14,7 +14,7 @@ If you already have a Databricks account, you can run the example notebooks on a databricks workspace import --format AUTO --file init-pip-cuda-12.sh ${WS_SAVE_DIR}/init-pip-cuda-12.sh --profile ${PROFILE} ``` **Note**: the init script does the following on each Spark node: - - updates the CUDA runtime to 12.0 (required for Spark Rapids ML dependencies). + - updates the CUDA runtime (required for Spark Rapids ML dependencies). - downloads and installs the [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) plugin for accelerating data loading and Spark SQL. - installs various `cuXX` dependencies via pip. - if the cluster environment variable `SPARK_RAPIDS_ML_NO_IMPORT_ENABLED=1` is define (see below), the init script also modifies a Databricks notebook kernel startup script to enable no-import change UX for the cluster. See [no-import-change](../README.md#no-import-change). diff --git a/notebooks/databricks/init-pip-cuda-12.sh b/notebooks/databricks/init-pip-cuda-12.sh index 7f36b9278..70201ec85 100644 --- a/notebooks/databricks/init-pip-cuda-12.sh +++ b/notebooks/databricks/init-pip-cuda-12.sh @@ -42,7 +42,6 @@ ln -s /usr/local/cuda-12.2 /usr/local/cuda cuvs-cu12~=${RAPIDS_VERSION} \ pylibraft-cu12~=${RAPIDS_VERSION} \ raft-dask-cu12~=${RAPIDS_VERSION} \ - dask-cuda-cu12~=${RAPIDS_VERSION} \ numpy~=1.0 \ --extra-index-url=https://pypi.nvidia.com diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh index 6ab88749f..b00102d19 100644 --- a/notebooks/dataproc/spark_rapids_ml.sh +++ b/notebooks/dataproc/spark_rapids_ml.sh @@ -26,7 +26,6 @@ pip install --no-cache-dir \ cuvs-cu12~=${RAPIDS_VERSION} \ pylibraft-cu12~=${RAPIDS_VERSION} \ raft-dask-cu12~=${RAPIDS_VERSION} \ - dask-cuda-cu12~=${RAPIDS_VERSION} \ --extra-index-url=https://pypi.nvidia.com # install spark-rapids-ml diff --git a/python/README.md b/python/README.md index 8ae6ae114..266e8fcda 100644 --- a/python/README.md +++ b/python/README.md @@ -18,11 +18,11 @@ This PySpark-compatible API leverages the RAPIDS cuML python API to provide GPU- For simplicity, the following instructions just use Spark local mode, assuming a server with at least one GPU. -First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 12.0: +First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html). Example for CUDA Toolkit 12.2: ```bash conda create -n rapids-25.12 \ -c rapidsai -c conda-forge -c nvidia \ - cuml=25.12 cuvs=25.12 python=3.10 pylibraft=$RAPIDS_VERSION raft-dask=$RAPIDS_VERSION cuda-version=12.0 numpy~=1.0 + python=3.10 cuml=25.12 cuvs=25.12 pylibraft=25.12 raft-dask=25.12 cuda-version=12.2 numpy~=1.0 ``` **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting. Once you have a working environment, you can then try installing directly, if necessary. diff --git a/python/benchmark/databricks/init-pip-cuda-12.sh b/python/benchmark/databricks/init-pip-cuda-12.sh index 7f56e4500..f0e0afd72 100644 --- a/python/benchmark/databricks/init-pip-cuda-12.sh +++ b/python/benchmark/databricks/init-pip-cuda-12.sh @@ -47,7 +47,6 @@ ln -s /usr/local/cuda-12.2 /usr/local/cuda cuvs-cu12~=${RAPIDS_VERSION} \ pylibraft-cu12~=${RAPIDS_VERSION} \ raft-dask-cu12~=${RAPIDS_VERSION} \ - dask-cuda-cu12~=${RAPIDS_VERSION} \ numpy~=1.0 \ --extra-index-url=https://pypi.nvidia.com diff --git a/python/benchmark/dataproc/init_benchmark.sh b/python/benchmark/dataproc/init_benchmark.sh index fff0d5a4c..07babbdf8 100755 --- a/python/benchmark/dataproc/init_benchmark.sh +++ b/python/benchmark/dataproc/init_benchmark.sh @@ -35,7 +35,6 @@ pip install --no-cache-dir \ cuvs-cu12~=${RAPIDS_VERSION} \ pylibraft-cu12~=${RAPIDS_VERSION} \ raft-dask-cu12~=${RAPIDS_VERSION} \ - dask-cuda-cu12~=${RAPIDS_VERSION} \ --extra-index-url=https://pypi.nvidia.com # install benchmark files diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index 276dc4c04..8e6ebcd6a 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -1081,6 +1081,7 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]: ) except ValueError as e: # cuML now raises an exception if only one label value is observed. + # see e.g. https://github.com/rapidsai/cuml/blob/f7f175d7ae1c63e8eed3b66e581f328e0fd335be/python/cuml/cuml/solvers/qn.pyx#L96 # Here we suppress in that case until later as we can handle it when # fitIntercept=True. import traceback diff --git a/python/src/spark_rapids_ml/regression.py b/python/src/spark_rapids_ml/regression.py index 717b8d79e..b630bb365 100644 --- a/python/src/spark_rapids_ml/regression.py +++ b/python/src/spark_rapids_ml/regression.py @@ -79,7 +79,7 @@ _RandomForestEstimator, _RandomForestModel, ) -from .utils import PartitionDescriptor, _get_spark_session, java_uid +from .utils import _get_spark_session, java_uid if TYPE_CHECKING: import cupy as cp @@ -521,6 +521,9 @@ def _linear_regression_fit( dfs: FitInputType, params: Dict[str, Any], ) -> Dict[str, Any]: + + from .utils import PartitionDescriptor + # Step 1, get the PartitionDescriptor pdesc = PartitionDescriptor.build( params[param_alias.part_sizes], params[param_alias.num_cols]