Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion python/src/spark_rapids_ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,7 +1071,6 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]:
cuda_managed_mem_enabled,
cuda_system_mem_enabled,
cuda_system_mem_headroom,
force_sam_headroom=True,
)

logistic_regression.fit(
Expand Down
2 changes: 0 additions & 2 deletions python/src/spark_rapids_ml/clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,6 @@ def _cuml_fit(
cuda_managed_mem_enabled,
cuda_system_mem_enabled,
cuda_system_mem_headroom,
force_sam_headroom=True,
)

kmeans_object._fit(
Expand Down Expand Up @@ -997,7 +996,6 @@ def _cuml_fit(
cuda_managed_mem_enabled,
cuda_system_mem_enabled,
cuda_system_mem_headroom,
force_sam_headroom=True,
)

# Set out_dtype to 64bit to get larger indexType in cuML for avoiding overflow
Expand Down
1 change: 0 additions & 1 deletion python/src/spark_rapids_ml/knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,7 +758,6 @@ async def do_allGather() -> List[str]:
cuda_managed_mem_enabled,
cuda_system_mem_enabled,
cuda_system_mem_headroom,
force_sam_headroom=True,
)

res_tuple: Tuple[List[np.ndarray], List[np.ndarray]] = nn_object.kneighbors(
Expand Down
1 change: 0 additions & 1 deletion python/src/spark_rapids_ml/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,6 @@ def _single_fit(rf: cuRf) -> Dict[str, Any]:
cuda_managed_mem_enabled,
cuda_system_mem_enabled,
cuda_system_mem_headroom,
force_sam_headroom=True,
)

# Fit a random forest model on the dataset (X, y)
Expand Down
2 changes: 0 additions & 2 deletions python/src/spark_rapids_ml/umap.py
Original file line number Diff line number Diff line change
Expand Up @@ -1044,7 +1044,6 @@ def _cuml_fit(
cuda_managed_mem_enabled,
cuda_system_mem_enabled,
cuda_system_mem_headroom,
force_sam_headroom=True,
)

umap_model = umap_object.fit(concated, y=labels)
Expand All @@ -1054,7 +1053,6 @@ def _cuml_fit(
cuda_managed_mem_enabled,
cuda_system_mem_enabled,
cuda_system_mem_headroom,
force_sam_headroom=True,
)

# Call unsupervised fit
Expand Down
25 changes: 21 additions & 4 deletions python/src/spark_rapids_ml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,17 +163,29 @@ def _get_gpu_id(task_context: TaskContext) -> int:
return gpu_id


# When changing default rmm memory resources we retain the old ones
# in this global array singleton so that any (C++) allocations using them can
# invoke the corresponding deallocate methods. They will get cleaned up only when
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does that cause the memory already allocated within the default RMM mem resources not to be freed?

# the process exits. This avoids a segfault in the case of creating a new
# SAM resource with a smaller headroom.
_old_memory_resources = []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

style: potential thread safety issue if _configure_memory_resource is called concurrently from multiple threads. list.append() is atomic in CPython due to GIL, but consider using threading.Lock if multi-threaded scenarios are possible

Suggested change
_old_memory_resources = []
import threading
_old_memory_resources: List[Any] = []
_memory_resource_lock = threading.Lock()


# keep track of last headroom to check if new sam mr is needed.
_last_sam_headroom_size = None


def _configure_memory_resource(
uvm_enabled: bool = False,
sam_enabled: bool = False,
sam_headroom: Optional[int] = None,
force_sam_headroom: bool = False,
) -> None:
import cupy as cp
import rmm
from cuda.bindings import runtime
from rmm.allocators.cupy import rmm_cupy_allocator

global _last_sam_headroom_size

_SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(
runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess,
rmm._cuda.gpu.getDevice(),
Expand All @@ -193,19 +205,24 @@ def _configure_memory_resource(
if not type(rmm.mr.get_current_device_resource()) == type(
rmm.mr.SystemMemoryResource()
):
_old_memory_resources.append(rmm.mr.get_current_device_resource())
_last_sam_headroom_size = None
mr = rmm.mr.SystemMemoryResource()
rmm.mr.set_current_device_resource(mr)
elif sam_enabled and sam_headroom is not None:
if force_sam_headroom or not type(rmm.mr.get_current_device_resource()) == type(
rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)
):
if sam_headroom != _last_sam_headroom_size or not type(
rmm.mr.get_current_device_resource()
) == type(rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)):
_old_memory_resources.append(rmm.mr.get_current_device_resource())
_last_sam_headroom_size = sam_headroom
mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)
rmm.mr.set_current_device_resource(mr)

if uvm_enabled:
if not type(rmm.mr.get_current_device_resource()) == type(
rmm.mr.ManagedMemoryResource()
):
_old_memory_resources.append(rmm.mr.get_current_device_resource())
rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())

if sam_enabled or uvm_enabled:
Expand Down