Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions ptypy/accelerate/base/mem_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from math import floor


def max_fpb_from_scans(scans):
    """Return the largest 'max_frames_per_block' among the given scan models.

    Parameters
    ----------
    scans : dict
        maps arbitrary keys to scan model objects; only the values are
        inspected. Each value is expected to expose a
        ``max_frames_per_block`` attribute.

    Returns
    -------
    max_fpb : int or None
        the largest ``max_frames_per_block`` seen (0 for an empty dict),
        or None as soon as a model whose class is named "GradFull" is
        encountered, because 'frames_per_block' is irrelevant for it.

    """
    largest = 0
    for model in scans.values():
        if model.__class__.__name__ != "GradFull":
            candidate = model.max_frames_per_block
            # same tie-breaking as max(candidate, largest): keep the
            # running value only when it is strictly greater
            largest = largest if largest > candidate else candidate
            continue
        # a GradFull model makes 'frames_per_block' meaningless
        return None

    return largest

def calculate_safe_fpb(mem_avail, mem_per_frame, nblk=3):
    """Return a safe value of 'frames_per_block' from memory information.

    The available memory is split evenly into `nblk` blocks and the
    result is the number of whole frames that fit into one such block.

    Parameters
    ----------
    mem_avail : int or float
        the available GPU memory; zero is allowed and yields 0 frames
    mem_per_frame : int or float
        the GPU memory required for a single frame, must be > 0
    nblk : int, optional
        the total number of blocks the data will be divided into,
        must be > 0. Default to 3.

    Returns
    -------
    safe_fpb : int
        the number of frames fitting in one of the `nblk` blocks,
        rounded down.

    Raises
    ------
    ValueError
        if `mem_avail` is negative, or if `mem_per_frame` or `nblk` is
        not strictly positive.

    """
    if mem_avail < 0:
        raise ValueError("Available memory should not be negative.")
    # zero would otherwise surface as an unhelpful ZeroDivisionError below
    if mem_per_frame <= 0:
        raise ValueError("Memory per frame should be a positive number.")
    if nblk <= 0:
        raise ValueError("Number of blocks should be a positive number.")

    return floor((mem_avail / nblk) / mem_per_frame)
20 changes: 17 additions & 3 deletions ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from ptypy import utils as u
from ptypy.utils.verbose import logger, log
from ptypy.utils import parallel
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from .. import get_context, log_device_memory_stats
from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
Expand All @@ -34,6 +36,8 @@
# can be used to limit the number of blocks, simulating that they don't fit
MAX_BLOCKS = 99999
# MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit
# the number of blocks to have with a safe value of frames_per_block
NUM_BLK_SAFE_FPB = 3


@register()
Expand Down Expand Up @@ -161,9 +165,19 @@ def _setup_kernels(self):
mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()

# leave 200MB room for safety
fit = int(mem - 200 * 1024 * 1024) // blk
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down Expand Up @@ -318,7 +332,7 @@ def engine_iterate(self, num=1):
return err

def position_update(self):
"""
"""
Position refinement
"""
if not self.do_position_refinement or (not self.curiter):
Expand Down Expand Up @@ -746,7 +760,7 @@ def delxf(x, axis=-1):
self.dot = lambda x, y: self.AUK.dot(x, y).get().item()

self._grad_reg_kernel = cp.ElementwiseKernel(
"float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx",
"float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx",
"complex64 out",
"out = (px+py-my-mx) * fac",
"grad_reg",
Expand Down
13 changes: 12 additions & 1 deletion ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
from ptypy.accelerate.base.engines import projectional_serial
from ptypy.accelerate.base.mem_utils import calculate_safe_fpb
from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel,\
Expand All @@ -28,6 +29,9 @@

__all__ = ['DM_cupy', 'RAAR_cupy']

# the number of blocks to have with a safe value of frames_per_block
NUM_BLK_SAFE_FPB = 3


class _ProjectionEngine_cupy(projectional_serial._ProjectionEngine_serial):

Expand Down Expand Up @@ -122,6 +126,13 @@ def _setup_kernels(self):
mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
if not int(mem) // aux.nbytes:
log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
if scan.__class__.__name__ != "GradFull":
# only makes sense if the model is not GradFull
per_frame = (aux.nbytes / aux.shape[0]) * nmodes
safe_fpb = calculate_safe_fpb(mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {fpc}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")
kern.aux = cp.asarray(aux)

Expand Down Expand Up @@ -209,7 +220,7 @@ def engine_iterate(self, num=1):
queue.use()

for it in range(num):

reduced_error = np.zeros((3,))
reduced_error_count = 0
local_error = {}
Expand Down
30 changes: 22 additions & 8 deletions ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from ptypy.accelerate.cuda_cupy import log_device_memory_stats
from ptypy.utils.verbose import log
from ptypy.utils import parallel
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
from . import projectional_cupy
Expand All @@ -31,6 +33,8 @@
# can be used to limit the number of blocks, simulating that they don't fit
MAX_BLOCKS = 99999
# MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit
# the number of blocks to have with a safe value of frames_per_block
NUM_BLK_SAFE_FPB = 3

__all__ = ['DM_cupy_stream', 'RAAR_cupy_stream']

Expand All @@ -50,26 +54,36 @@ def engine_initialize(self):
self.qu_dtoh = cp.cuda.Stream()

def _setup_kernels(self):

super()._setup_kernels()
ex_mem = 0
mag_mem = 0
for scan, kern in self.kernels.items():
ex_mem = max(kern.aux.nbytes, ex_mem)
mag_mem = max(kern.FUK.gpu.fdev.nbytes, mag_mem)
ma_mem = mag_mem

blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem

# We need to add the free memory from the pool to the free device memory,
# as both will be used for allocations
mempool = cp.get_default_memory_pool()
mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()

# leave 200MB room for safety
fit = int(mem - 200 * 1024 * 1024) // blk
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down Expand Up @@ -322,7 +336,7 @@ def engine_iterate(self, num=1):
# Update positions
if do_update_pos:
"""
Iterates through all positions and refines them by a given algorithm.
Iterates through all positions and refines them by a given algorithm.
"""
log(4, "----------- START POS REF -------------")
for dID in self.di.S.keys():
Expand Down Expand Up @@ -390,7 +404,7 @@ def engine_iterate(self, num=1):
data_ma.record_done(self.queue, 'compute')
cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
src=error_state.data.ptr,
size=err_fourier.nbytes,
size=err_fourier.nbytes,
kind=3, # d2d
stream=self.queue.ptr)
if use_tiles:
Expand All @@ -405,7 +419,7 @@ def engine_iterate(self, num=1):
cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
for name, s in self.pr.S.items():
cp.asnumpy(s.gpu, stream=self.queue, out=s.data)

# Gather errors from device
for dID, prep in self.diff_info.items():
err_fourier = prep.err_fourier_gpu.get()
Expand Down
18 changes: 15 additions & 3 deletions ptypy/accelerate/cuda_cupy/engines/stochastic.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from ptypy.engines import register
from ptypy.engines.stochastic import EPIEMixin, SDRMixin
from ptypy.accelerate.base.engines.stochastic import _StochasticEngineSerial
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from ptypy.accelerate.base import address_manglers
from .. import get_context
from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel,\
Expand All @@ -35,6 +37,8 @@
# can be used to limit the number of blocks, simulating that they don't fit
MAX_BLOCKS = 99999
# MAX_BLOCKS = 10 # can be used to limit the number of blocks, simulating that they don't fit
# the number of blocks to have with a safe value of frames_per_block
NUM_BLK_SAFE_FPB = 3


class _StochasticEngineCupy(_StochasticEngineSerial):
Expand Down Expand Up @@ -162,11 +166,19 @@ def _setup_kernels(self):
mem = cp.cuda.runtime.memGetInfo()[0]
blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
# leave 200MB room for safety
fit = int(mem - 200 * 1024 * 1024) // blk
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
self.context.pop()
self.context.detach()
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down
18 changes: 17 additions & 1 deletion ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from ptypy import utils as u
from ptypy.utils.verbose import logger, log
from ptypy.utils import parallel
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from .. import get_context, get_dev_pool
from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
Expand All @@ -33,6 +35,8 @@

MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit
#MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit
# the number of blocks to have with a safe value of frames_per_block
NUM_BLK_SAFE_FPB = 3

@register()
class ML_pycuda(ML_serial):
Expand Down Expand Up @@ -163,9 +167,21 @@ def _setup_kernels(self):
ma_mem = mag_mem
mem = cuda.mem_get_info()[0]
blk = ma_mem + mag_mem
fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety

# leave 200MB room for safety
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down
12 changes: 12 additions & 0 deletions ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
from ptypy.accelerate.base.engines import projectional_serial
from ptypy.accelerate.base.mem_utils import calculate_safe_fpb
from .. import get_context
from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
Expand All @@ -30,6 +31,10 @@

__all__ = ['DM_pycuda', 'RAAR_pycuda']

# the number of blocks to have with a safe value of frames_per_block
NUM_BLK_SAFE_FPB = 3


class _ProjectionEngine_pycuda(projectional_serial._ProjectionEngine_serial):

"""
Expand Down Expand Up @@ -123,6 +128,13 @@ def _setup_kernels(self):
mem = cuda.mem_get_info()[0]
if not int(mem) // aux.nbytes:
log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
if scan.__class__.__name__ != "GradFull":
# only makes sense if the model is not GradFull
per_frame = (aux.nbytes / aux.shape[0]) * nmodes
safe_fpb = calculate_safe_fpb(mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {fpc}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")
kern.aux = gpuarray.to_gpu(aux)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from ptypy import utils as u
from ptypy.utils.verbose import log, logger
from ptypy.utils import parallel
from ptypy.accelerate.base.mem_utils import (max_fpb_from_scans,
calculate_safe_fpb)
from ptypy.engines import register
from ptypy.engines.projectional import DMMixin, RAARMixin
from . import projectional_pycuda
Expand All @@ -32,6 +34,8 @@
EX_MA_BLOCKS_RATIO = 2
MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit
#MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit
# the number of blocks to have with a safe value of frames_per_block
NUM_BLK_SAFE_FPB = 3

__all__ = ['DM_pycuda_stream', 'RAAR_pycuda_stream']

Expand Down Expand Up @@ -61,9 +65,21 @@ def _setup_kernels(self):
ma_mem = mag_mem
mem = cuda.mem_get_info()[0]
blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety

# leave 200MB room for safety
avail_mem = max(int(mem - 200 * 1024 * 1024), 0)
fit = avail_mem // blk
if not fit:
log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
# max_fpb is None if there is a GradFull in the scan models
# as 'frames_per_block' is irrelevant
max_fpb = max_fpb_from_scans(self.ptycho.model.scans)
if max_fpb is not None:
per_frame = blk / max_fpb
safe_fpb = calculate_safe_fpb(avail_mem, per_frame, NUM_BLK_SAFE_FPB)
log(1,f"Your current 'frames_per_block' is {max_fpb}.")
log(1,f"With current reconstruction parameters and computing resources, you can try setting 'frames_per_block' to {safe_fpb}.")
log(1,f"This would divide your reonstruction into {NUM_BLK_SAFE_FPB} blocks.")
raise SystemExit("ptypy has been exited.")

# TODO grow blocks dynamically
Expand Down
Loading