From 7e311c63d6e6baf751f452a5d16aa43214d9136f Mon Sep 17 00:00:00 2001 From: nvauto <70000568+nvauto@users.noreply.github.com> Date: Fri, 7 Nov 2025 06:26:41 +0000 Subject: [PATCH 1/2] Create release branch release/25.10 Signed-off-by: nvauto <70000568+nvauto@users.noreply.github.com> From 904d2986a4c68610ae463bb6bb55f22e62f4efc2 Mon Sep 17 00:00:00 2001 From: eordentlich Date: Wed, 12 Nov 2025 17:06:03 -0800 Subject: [PATCH 2/2] retain global pointers to previous default rmm memory resources (#995) This is needed to avoid race condition segfaults with SAM when SAM headroom is reduced from its initial larger value during data loading to a smaller value during computations. --------- Signed-off-by: Erik Ordentlich --- python/src/spark_rapids_ml/classification.py | 1 - python/src/spark_rapids_ml/clustering.py | 2 -- python/src/spark_rapids_ml/knn.py | 1 - python/src/spark_rapids_ml/tree.py | 1 - python/src/spark_rapids_ml/umap.py | 2 -- python/src/spark_rapids_ml/utils.py | 25 ++++++++++++++++---- 6 files changed, 21 insertions(+), 11 deletions(-) diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index 07de849a..73cc32bd 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -1071,7 +1071,6 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) logistic_regression.fit( diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 5504003b..19566278 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -391,7 +391,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) kmeans_object._fit( @@ -997,7 +996,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, 
cuda_system_mem_headroom, - force_sam_headroom=True, ) # Set out_dtype tp 64bit to get larger indexType in cuML for avoiding overflow diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index d2bc61f3..347de9c1 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -758,7 +758,6 @@ async def do_allGather() -> List[str]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) res_tuple: Tuple[List[np.ndarray], List[np.ndarray]] = nn_object.kneighbors( diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index 10d66949..b267e41f 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -407,7 +407,6 @@ def _single_fit(rf: cuRf) -> Dict[str, Any]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Fit a random forest model on the dataset (X, y) diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index fb9240fa..6b9cdb5a 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1044,7 +1044,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) umap_model = umap_object.fit(concated, y=labels) @@ -1054,7 +1053,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Call unsupervised fit diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index 7e1ab923..7d043480 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -163,17 +163,29 @@ def _get_gpu_id(task_context: TaskContext) -> int: return gpu_id +# When changing default rmm memory resources we retain the old ones +# in this global array singleton so that any (C++) allocations using them can +# invoke 
the corresponding deallocate methods. They will get cleaned up only when +# the process exits. This avoids a segfault in the case of creating a new +# SAM resource with a smaller headroom. +_old_memory_resources = [] + +# keep track of last headroom to check if new sam mr is needed. +_last_sam_headroom_size = None + + def _configure_memory_resource( uvm_enabled: bool = False, sam_enabled: bool = False, sam_headroom: Optional[int] = None, - force_sam_headroom: bool = False, ) -> None: import cupy as cp import rmm from cuda.bindings import runtime from rmm.allocators.cupy import rmm_cupy_allocator + global _last_sam_headroom_size + _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess, rmm._cuda.gpu.getDevice(), @@ -193,12 +205,16 @@ def _configure_memory_resource( if not type(rmm.mr.get_current_device_resource()) == type( rmm.mr.SystemMemoryResource() ): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) + _last_sam_headroom_size = None mr = rmm.mr.SystemMemoryResource() rmm.mr.set_current_device_resource(mr) elif sam_enabled and sam_headroom is not None: - if force_sam_headroom or not type(rmm.mr.get_current_device_resource()) == type( - rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) - ): + if sam_headroom != _last_sam_headroom_size or not type( + rmm.mr.get_current_device_resource() + ) == type(rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) + _last_sam_headroom_size = sam_headroom mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) rmm.mr.set_current_device_resource(mr) @@ -206,6 +222,7 @@ def _configure_memory_resource( if not type(rmm.mr.get_current_device_resource()) == type( rmm.mr.ManagedMemoryResource() ): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) if sam_enabled or uvm_enabled: