ptycho · ptim0626 · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024 · Sep 11, 2024
diff --git a/ptypy/accelerate/base/mem_utils.py b/ptypy/accelerate/base/mem_utils.py
@@ -0,0 +1,69 @@
+from math import floor
+
+
+def max_fpb_from_scans(scans):
+    """Find maximum 'frames_per_block' from different scan models.
+
+    Parameters
+    ----------
+    scans : dict
+        a dictionary whose values are the different scan models, i.e.
+        instances of ptypy.core.manager.ScanModel.
+
+    Returns
+    -------
+    max_fpb : int or None
+        the largest 'max_frames_per_block' of these scan models (0 if
+        `scans` is empty), None as soon as one of them is GradFull as
+        'frames_per_block' is irrelevant for it.
+
+    """
+    max_fpb = 0
+    for scan in scans.values():
+        if scan.__class__.__name__ == "GradFull":
+            # the 'frames_per_block' is irrelevant for the GradFull
+            # model
+            return None
+        max_fpb = max(scan.max_frames_per_block, max_fpb)
+
+    return max_fpb
+
+
+def calculate_safe_fpb(mem_avail, mem_per_frame, nblk=3):
+    """Return a safe value of 'frames_per_block' from memory information.
+
+    The available memory is shared equally between `nblk` blocks and
+    the result is the number of whole frames fitting into one share.
+
+    Parameters
+    ----------
+    mem_avail : int
+        the available GPU memory, must not be negative
+    mem_per_frame : int or float
+        the GPU memory required for a single frame, must be positive
+    nblk : int, optional
+        the total number of blocks the data will be divided into,
+        must be positive. Default to 3.
+
+    Returns
+    -------
+    safe_fpb : int
+        the number of whole frames fitting into one of the `nblk`
+        blocks, 0 if not even a single frame fits.
+
+    Raises
+    ------
+    ValueError
+        if `mem_avail` is negative, or if `mem_per_frame` or `nblk`
+        is zero or negative.
+
+    """
+    if mem_avail < 0 or mem_per_frame <= 0:
+        # a zero-sized frame would make the division below fail
+        msg = "Memory should be a positive number."
+        raise ValueError(msg)
+    if nblk <= 0:
+        msg = "The number of blocks should be a positive number."
+        raise ValueError(msg)
+
+    return floor((mem_avail / nblk) / mem_per_frame)
diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
@@ -20,6 +20,8 @@
 from ptypy import utils as u
 from ptypy.utils.verbose import logger, log
 from ptypy.utils import parallel
+from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
+calculate_safe_fpb)
 from .. import get_context, log_device_memory_stats
 from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
 from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
@@ -34,6 +36,8 @@
 # can be used to limit the number of blocks, simulating that they don't fit
 MAX_BLOCKS = 99999
 # MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+# the number of blocks to have with a safe value of frames_per_block
+NUM_BLK_SAFE_FPB = 3
 
 
 @register()
@@ -161,9 +165,19 @@ def _setup_kernels(self):
         mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
 
         # leave 200MB room for safety
-        fit = int(mem - 200 * 1024 * 1024) // blk
+        avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
+        fit =  avail_mem // blk
         if not fit:
             log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            # max_fpb is None if there is a GradFull in the scan models
+            # as 'frames_per_block' is irrelevant
+            max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
+            if max_fpb is not None:
+                per_frame = blk / max_fpb
+                safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
+                log(1,f"Your current 'frames_per_block' is {max_fpb}.")
+                log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
+                log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically
@@ -318,7 +332,7 @@ def engine_iterate(self, num=1):
         return err
 
     def position_update(self):
-        """ 
+        """
         Position refinement
         """
         if not self.do_position_refinement or (not self.curiter):
@@ -746,7 +760,7 @@ def delxf(x, axis=-1):
         self.dot = lambda x, y: self.AUK.dot(x, y).get().item()
 
         self._grad_reg_kernel = cp.ElementwiseKernel(
-            "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx", 
+            "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx",
             "complex64 out",
             "out = (px+py-my-mx) * fac",
             "grad_reg",

diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
@@ -18,6 +18,7 @@
 from ptypy.engines import register
 from ptypy.engines.projectional import DMMixin, RAARMixin
 from ptypy.accelerate.base.engines import projectional_serial
+from ptypy.accelerate.base.mem_utils import calculate_safe_fpb
 from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
 from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
 from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel,\
@@ -28,6 +29,9 @@
 
 __all__ = ['DM_cupy', 'RAAR_cupy']
 
+# the number of blocks to have with a safe value of frames_per_block
+NUM_BLK_SAFE_FPB = 3
+
 
 class _ProjectionEngine_cupy(projectional_serial._ProjectionEngine_serial):
 
@@ -122,6 +126,13 @@ def _setup_kernels(self):
             mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
             if not int(mem) // aux.nbytes:
                 log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
+                if scan.__class__.__name__ != "GradFull":
+                    # only make sense if the model is not GradFull
+                    per_frame = (aux.nbytes / aux.shape[0]) * nmodes
+                    safe_fpb = calculate_safe_fpb(mem, per_frame, NUM_BLK_SAFE_FPB)
+                    log(1,f"Your current 'frames_per_block' is {fpc}.")
+                    log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
+                    log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
                 raise SystemExit("ptypy has been exited.")
             kern.aux = cp.asarray(aux)
 
@@ -209,7 +220,7 @@ def engine_iterate(self, num=1):
         queue.use()
 
         for it in range(num):
-                        
+
             reduced_error = np.zeros((3,))
             reduced_error_count = 0
             local_error = {}

diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
@@ -20,6 +20,8 @@
 from ptypy.accelerate.cuda_cupy import log_device_memory_stats
 from ptypy.utils.verbose import log
 from ptypy.utils import parallel
+from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
+calculate_safe_fpb)
 from ptypy.engines import register
 from ptypy.engines.projectional import DMMixin, RAARMixin
 from . import projectional_cupy
@@ -31,6 +33,8 @@
 # can be used to limit the number of blocks, simulating that they don't fit
 MAX_BLOCKS = 99999
 # MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+# the number of blocks to have with a safe value of frames_per_block
+NUM_BLK_SAFE_FPB = 3
 
 __all__ = ['DM_cupy_stream', 'RAAR_cupy_stream']
 
@@ -50,26 +54,36 @@ def engine_initialize(self):
         self.qu_dtoh = cp.cuda.Stream()
 
     def _setup_kernels(self):
-        
+
         super()._setup_kernels()
         ex_mem = 0
         mag_mem = 0
         for scan, kern in self.kernels.items():
             ex_mem = max(kern.aux.nbytes, ex_mem)
             mag_mem = max(kern.FUK.gpu.fdev.nbytes, mag_mem)
         ma_mem = mag_mem
-        
+
         blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
-        
+
         # We need to add the free memory from the pool to the free device memory,
         # as both will be used for allocations
         mempool = cp.get_default_memory_pool()
         mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
-        
+
         # leave 200MB room for safety
-        fit = int(mem - 200 * 1024 * 1024) // blk
+        avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
+        fit =  avail_mem // blk
         if not fit:
             log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            # max_fpb is None if there is a GradFull in the scan models
+            # as 'frames_per_block' is irrelevant
+            max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
+            if max_fpb is not None:
+                per_frame = blk / max_fpb
+                safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
+                log(1,f"Your current 'frames_per_block' is {max_fpb}.")
+                log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
+                log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically
@@ -322,7 +336,7 @@ def engine_iterate(self, num=1):
                 # Update positions
                 if do_update_pos:
                     """
-                    Iterates through all positions and refines them by a given algorithm. 
+                    Iterates through all positions and refines them by a given algorithm.
                     """
                     log(4, "----------- START POS REF -------------")
                     for dID in self.di.S.keys():
@@ -390,7 +404,7 @@ def engine_iterate(self, num=1):
                         data_ma.record_done(self.queue, 'compute')
                         cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
                                                src=error_state.data.ptr,
-                                               size=err_fourier.nbytes, 
+                                               size=err_fourier.nbytes,
                                                kind=3, # d2d
                                                stream=self.queue.ptr)
                         if use_tiles:
@@ -405,7 +419,7 @@ def engine_iterate(self, num=1):
             cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
         for name, s in self.pr.S.items():
             cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
-        
+
         # Gather errors from device
         for dID, prep in self.diff_info.items():
             err_fourier = prep.err_fourier_gpu.get()

diff --git a/ptypy/accelerate/cuda_cupy/engines/stochastic.py b/ptypy/accelerate/cuda_cupy/engines/stochastic.py
@@ -19,6 +19,8 @@
 from ptypy.engines import register
 from ptypy.engines.stochastic import EPIEMixin, SDRMixin
 from ptypy.accelerate.base.engines.stochastic import _StochasticEngineSerial
+from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
+calculate_safe_fpb)
 from ptypy.accelerate.base import address_manglers
 from .. import get_context
 from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel,\
@@ -35,6 +37,8 @@
 # can be used to limit the number of blocks, simulating that they don't fit
 MAX_BLOCKS = 99999
 # MAX_BLOCKS = 10  # can be used to limit the number of blocks, simulating that they don't fit
+# the number of blocks to have with a safe value of frames_per_block
+NUM_BLK_SAFE_FPB = 3
 
 
 class _StochasticEngineCupy(_StochasticEngineSerial):
@@ -162,11 +166,19 @@ def _setup_kernels(self):
         mem = cp.cuda.runtime.memGetInfo()[0]
         blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
         # leave 200MB room for safety
-        fit = int(mem - 200 * 1024 * 1024) // blk
+        avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
+        fit =  avail_mem // blk
         if not fit:
             log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
-            self.context.pop()
-            self.context.detach()
+            # max_fpb is None if there is a GradFull in the scan models
+            # as 'frames_per_block' is irrelevant
+            max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
+            if max_fpb is not None:
+                per_frame = blk / max_fpb
+                safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
+                log(1,f"Your current 'frames_per_block' is {max_fpb}.")
+                log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
+                log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically

diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
@@ -21,6 +21,8 @@
 from ptypy import utils as u
 from ptypy.utils.verbose import logger, log
 from ptypy.utils import parallel
+from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
+calculate_safe_fpb)
 from .. import get_context, get_dev_pool
 from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
 from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
@@ -33,6 +35,8 @@
 
 MAX_BLOCKS = 99999  # can be used to limit the number of blocks, simulating that they don't fit
 #MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+# the number of blocks to have with a safe value of frames_per_block
+NUM_BLK_SAFE_FPB = 3
 
 @register()
 class ML_pycuda(ML_serial):
@@ -163,9 +167,21 @@ def _setup_kernels(self):
         ma_mem = mag_mem
         mem = cuda.mem_get_info()[0]
         blk = ma_mem + mag_mem
-        fit = int(mem - 200 * 1024 * 1024) // blk  # leave 200MB room for safety
+
+        # leave 200MB room for safety
+        avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
+        fit =  avail_mem // blk
         if not fit:
             log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            # max_fpb is None if there is a GradFull in the scan models
+            # as 'frames_per_block' is irrelevant
+            max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
+            if max_fpb is not None:
+                per_frame = blk / max_fpb
+                safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
+                log(1,f"Your current 'frames_per_block' is {max_fpb}.")
+                log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
+                log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically

diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
@@ -19,6 +19,7 @@
 from ptypy.engines import register
 from ptypy.engines.projectional import DMMixin, RAARMixin
 from ptypy.accelerate.base.engines import projectional_serial
+from ptypy.accelerate.base.mem_utils import calculate_safe_fpb
 from .. import get_context
 from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
 from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
@@ -30,6 +31,10 @@
 
 __all__ = ['DM_pycuda', 'RAAR_pycuda']
 
+# the number of blocks to have with a safe value of frames_per_block
+NUM_BLK_SAFE_FPB = 3
+
+
 class _ProjectionEngine_pycuda(projectional_serial._ProjectionEngine_serial):
 
     """
@@ -123,6 +128,13 @@ def _setup_kernels(self):
             mem = cuda.mem_get_info()[0]
             if not int(mem) // aux.nbytes:
                 log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
+                if scan.__class__.__name__ != "GradFull":
+                    # only make sense if the model is not GradFull
+                    per_frame = (aux.nbytes / aux.shape[0]) * nmodes
+                    safe_fpb = calculate_safe_fpb(mem, per_frame, NUM_BLK_SAFE_FPB)
+                    log(1,f"Your current 'frames_per_block' is {fpc}.")
+                    log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
+                    log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
                 raise SystemExit("ptypy has been exited.")
             kern.aux = gpuarray.to_gpu(aux)
 

diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py
@@ -22,6 +22,8 @@
 from ptypy import utils as u
 from ptypy.utils.verbose import log, logger
 from ptypy.utils import parallel
+from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
+calculate_safe_fpb)
 from ptypy.engines import register
 from ptypy.engines.projectional import DMMixin, RAARMixin
 from . import projectional_pycuda
@@ -32,6 +34,8 @@
 EX_MA_BLOCKS_RATIO = 2
 MAX_BLOCKS = 99999  # can be used to limit the number of blocks, simulating that they don't fit
 #MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+# the number of blocks to have with a safe value of frames_per_block
+NUM_BLK_SAFE_FPB = 3
 
 __all__ = ['DM_pycuda_stream', 'RAAR_pycuda_stream']
 
@@ -61,9 +65,21 @@ def _setup_kernels(self):
         ma_mem = mag_mem
         mem = cuda.mem_get_info()[0]
         blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
-        fit = int(mem - 200 * 1024 * 1024) // blk  # leave 200MB room for safety
+
+        # leave 200MB room for safety
+        avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
+        fit =  avail_mem // blk
         if not fit:
             log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            # max_fpb is None if there is a GradFull in the scan models
+            # as 'frames_per_block' is irrelevant
+            max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
+            if max_fpb is not None:
+                per_frame = blk / max_fpb
+                safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
+                log(1,f"Your current 'frames_per_block' is {max_fpb}.")
+                log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
+                log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically