From a34bf0907966abcfea28bd8872a4d03ab738b606 Mon Sep 17 00:00:00 2001
From: Erik Ordentlich
Date: Tue, 11 Nov 2025 21:24:37 -0800
Subject: [PATCH 1/2] retain global pointers to previous default rmm memory resources when changing to avoid race condition segfaults with SAM

Signed-off-by: Erik Ordentlich
---
 python/src/spark_rapids_ml/utils.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py
index 7e1ab923..6485c291 100644
--- a/python/src/spark_rapids_ml/utils.py
+++ b/python/src/spark_rapids_ml/utils.py
@@ -163,6 +163,14 @@ def _get_gpu_id(task_context: TaskContext) -> int:
     return gpu_id
 
 
+# When changing default rmm memory resources we retain the old ones
+# in this global array singleton so that any (C++) allocations using them can
+# invoke the corresponding deallocate methods. They will get cleaned up only when
+# the process exits. This avoids a segfault in the case of creating a new
+# SAM resource with a smaller headroom.
+_old_memory_resources = [] + + def _configure_memory_resource( uvm_enabled: bool = False, sam_enabled: bool = False, @@ -193,12 +201,14 @@ def _configure_memory_resource( if not type(rmm.mr.get_current_device_resource()) == type( rmm.mr.SystemMemoryResource() ): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) mr = rmm.mr.SystemMemoryResource() rmm.mr.set_current_device_resource(mr) elif sam_enabled and sam_headroom is not None: if force_sam_headroom or not type(rmm.mr.get_current_device_resource()) == type( rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) ): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) rmm.mr.set_current_device_resource(mr) @@ -206,6 +216,7 @@ def _configure_memory_resource( if not type(rmm.mr.get_current_device_resource()) == type( rmm.mr.ManagedMemoryResource() ): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) if sam_enabled or uvm_enabled: From 33254aad991b57210e4740156a8f660dd597837b Mon Sep 17 00:00:00 2001 From: Erik Ordentlich Date: Wed, 12 Nov 2025 12:01:48 -0800 Subject: [PATCH 2/2] keep track of last sam headroom and update resource if changed Signed-off-by: Erik Ordentlich --- python/src/spark_rapids_ml/classification.py | 1 - python/src/spark_rapids_ml/clustering.py | 2 -- python/src/spark_rapids_ml/knn.py | 1 - python/src/spark_rapids_ml/tree.py | 1 - python/src/spark_rapids_ml/umap.py | 2 -- python/src/spark_rapids_ml/utils.py | 14 ++++++++++---- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index 07de849a..73cc32bd 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -1071,7 +1071,6 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]: 
cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) logistic_regression.fit( diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 5504003b..19566278 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -391,7 +391,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) kmeans_object._fit( @@ -997,7 +996,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Set out_dtype tp 64bit to get larger indexType in cuML for avoiding overflow diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index d2bc61f3..347de9c1 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -758,7 +758,6 @@ async def do_allGather() -> List[str]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) res_tuple: Tuple[List[np.ndarray], List[np.ndarray]] = nn_object.kneighbors( diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index 10d66949..b267e41f 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -407,7 +407,6 @@ def _single_fit(rf: cuRf) -> Dict[str, Any]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Fit a random forest model on the dataset (X, y) diff --git a/python/src/spark_rapids_ml/umap.py b/python/src/spark_rapids_ml/umap.py index fb9240fa..6b9cdb5a 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1044,7 +1044,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) umap_model = umap_object.fit(concated, y=labels) @@ -1054,7 
+1053,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Call unsupervised fit diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index 6485c291..7d043480 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -170,18 +170,22 @@ def _get_gpu_id(task_context: TaskContext) -> int: # SAM resource with a smaller headroom. _old_memory_resources = [] +# keep track of last headroom to check if new sam mr is needed. +_last_sam_headroom_size = None + def _configure_memory_resource( uvm_enabled: bool = False, sam_enabled: bool = False, sam_headroom: Optional[int] = None, - force_sam_headroom: bool = False, ) -> None: import cupy as cp import rmm from cuda.bindings import runtime from rmm.allocators.cupy import rmm_cupy_allocator + global _last_sam_headroom_size + _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess, rmm._cuda.gpu.getDevice(), @@ -202,13 +206,15 @@ def _configure_memory_resource( rmm.mr.SystemMemoryResource() ): _old_memory_resources.append(rmm.mr.get_current_device_resource()) + _last_sam_headroom_size = None mr = rmm.mr.SystemMemoryResource() rmm.mr.set_current_device_resource(mr) elif sam_enabled and sam_headroom is not None: - if force_sam_headroom or not type(rmm.mr.get_current_device_resource()) == type( - rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) - ): + if sam_headroom != _last_sam_headroom_size or not type( + rmm.mr.get_current_device_resource() + ) == type(rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)): _old_memory_resources.append(rmm.mr.get_current_device_resource()) + _last_sam_headroom_size = sam_headroom mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) rmm.mr.set_current_device_resource(mr)