diff --git a/python/src/spark_rapids_ml/classification.py b/python/src/spark_rapids_ml/classification.py index 07de849a..73cc32bd 100644 --- a/python/src/spark_rapids_ml/classification.py +++ b/python/src/spark_rapids_ml/classification.py @@ -1071,7 +1071,6 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) logistic_regression.fit( diff --git a/python/src/spark_rapids_ml/clustering.py b/python/src/spark_rapids_ml/clustering.py index 5504003b..19566278 100644 --- a/python/src/spark_rapids_ml/clustering.py +++ b/python/src/spark_rapids_ml/clustering.py @@ -391,7 +391,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) kmeans_object._fit( @@ -997,7 +996,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Set out_dtype tp 64bit to get larger indexType in cuML for avoiding overflow diff --git a/python/src/spark_rapids_ml/knn.py b/python/src/spark_rapids_ml/knn.py index d2bc61f3..347de9c1 100644 --- a/python/src/spark_rapids_ml/knn.py +++ b/python/src/spark_rapids_ml/knn.py @@ -758,7 +758,6 @@ async def do_allGather() -> List[str]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) res_tuple: Tuple[List[np.ndarray], List[np.ndarray]] = nn_object.kneighbors( diff --git a/python/src/spark_rapids_ml/tree.py b/python/src/spark_rapids_ml/tree.py index 10d66949..b267e41f 100644 --- a/python/src/spark_rapids_ml/tree.py +++ b/python/src/spark_rapids_ml/tree.py @@ -407,7 +407,6 @@ def _single_fit(rf: cuRf) -> Dict[str, Any]: cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Fit a random forest model on the dataset (X, y) diff --git a/python/src/spark_rapids_ml/umap.py 
b/python/src/spark_rapids_ml/umap.py index fb9240fa..6b9cdb5a 100644 --- a/python/src/spark_rapids_ml/umap.py +++ b/python/src/spark_rapids_ml/umap.py @@ -1044,7 +1044,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) umap_model = umap_object.fit(concated, y=labels) @@ -1054,7 +1053,6 @@ def _cuml_fit( cuda_managed_mem_enabled, cuda_system_mem_enabled, cuda_system_mem_headroom, - force_sam_headroom=True, ) # Call unsupervised fit diff --git a/python/src/spark_rapids_ml/utils.py b/python/src/spark_rapids_ml/utils.py index 7e1ab923..7d043480 100644 --- a/python/src/spark_rapids_ml/utils.py +++ b/python/src/spark_rapids_ml/utils.py @@ -163,17 +163,29 @@ def _get_gpu_id(task_context: TaskContext) -> int: return gpu_id +# When changing default rmm memory resources we retain the old ones +# in this global array singleton so that any (C++) allocations using them can +# invoke the corresponding deallocate methods. They will get cleaned up only when +# the process exits. This avoids a segfault in the case of creating a new +# SAM resource with a smaller headroom. +_old_memory_resources = [] + +# Keep track of the last SAM headroom size to check whether a new SAM memory resource is needed.
+_last_sam_headroom_size = None + + def _configure_memory_resource( uvm_enabled: bool = False, sam_enabled: bool = False, sam_headroom: Optional[int] = None, - force_sam_headroom: bool = False, ) -> None: import cupy as cp import rmm from cuda.bindings import runtime from rmm.allocators.cupy import rmm_cupy_allocator + global _last_sam_headroom_size + _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess, rmm._cuda.gpu.getDevice(), @@ -193,12 +205,16 @@ def _configure_memory_resource( if not type(rmm.mr.get_current_device_resource()) == type( rmm.mr.SystemMemoryResource() ): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) + _last_sam_headroom_size = None mr = rmm.mr.SystemMemoryResource() rmm.mr.set_current_device_resource(mr) elif sam_enabled and sam_headroom is not None: - if force_sam_headroom or not type(rmm.mr.get_current_device_resource()) == type( - rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) - ): + if sam_headroom != _last_sam_headroom_size or not type( + rmm.mr.get_current_device_resource() + ) == type(rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) + _last_sam_headroom_size = sam_headroom mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom) rmm.mr.set_current_device_resource(mr) @@ -206,6 +222,7 @@ def _configure_memory_resource( if not type(rmm.mr.get_current_device_resource()) == type( rmm.mr.ManagedMemoryResource() ): + _old_memory_resources.append(rmm.mr.get_current_device_resource()) rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) if sam_enabled or uvm_enabled: