From af7558973f73ae8dc7b1211ea8b903ba3e81abe7 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 14 Nov 2024 09:27:21 +0000 Subject: [PATCH 01/12] ML with separate gradient steps --- ptypy/custom/ML_separate_grads.py | 782 ++++++++++++++++++++++++++++++ 1 file changed, 782 insertions(+) create mode 100644 ptypy/custom/ML_separate_grads.py diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/ML_separate_grads.py new file mode 100644 index 000000000..49b397f9e --- /dev/null +++ b/ptypy/custom/ML_separate_grads.py @@ -0,0 +1,782 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood separate gradients reconstruction engine. + +TODO. + + * Implement other ML models (Poisson/Euclid) + +This file is part of the PTYPY package. + + :copyright: Copyright 2024 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +import time + +from .. import utils as u +from ..utils.verbose import logger +from ..utils import parallel +from ..engines.utils import Cnorm2, Cdot +from ..engines import register +from ..engines.base import BaseEngine, PositionCorrectionEngine +from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull + + +__all__ = ['MLSeparateGrads'] + + +@register() +class MLSeparateGrads(PositionCorrectionEngine): + """ + Maximum likelihood reconstruction engine. + + + Defaults: + + [name] + default = ML + type = str + help = + doc = + + [ML_type] + default = 'gaussian' + type = str + help = Likelihood model + choices = ['gaussian','poisson','euclid'] + doc = One of ‘gaussian’, poisson’ or ‘euclid’. + + [floating_intensities] + default = False + type = bool + help = Adaptive diffraction pattern rescaling + doc = If True, allow for adaptative rescaling of the diffraction pattern intensities (to correct for incident beam intensity fluctuations). + + [intensity_renormalization] + default = 1. + type = float + lowlim = 0.0 + help = Rescales the intensities so they can be interpreted as Poisson counts. + + [reg_del2] + default = False + type = bool + help = Whether to use a Gaussian prior (smoothing) regularizer + + [reg_del2_amplitude] + default = .01 + type = float + lowlim = 0.0 + help = Amplitude of the Gaussian prior if used + + [smooth_gradient] + default = 0.0 + type = float + help = Smoothing preconditioner + doc = Sigma for gaussian filter (turned off if 0.) + + [smooth_gradient_decay] + default = 0. + type = float + help = Decay rate for smoothing preconditioner + doc = Sigma for gaussian filter will reduce exponentially at this rate + + [scale_precond] + default = False + type = bool + help = Whether to use the object/probe scaling preconditioner + doc = This parameter can give faster convergence for weakly scattering samples. + + [probe_update_start] + default = 0 + type = int + lowlim = 0 + help = Number of iterations before probe update starts + # NOTE: probe_update_start doesn't work with this code, need to add some code to fix this + + [poly_line_coeffs] + default = quadratic + type = str + help = How many coefficients to be used in the the linesearch + choices = ['quadratic','all'] + doc = choose between the 'quadratic' approximation (default) or 'all' + + """ + + SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull] + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. 
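+
+        Unlike the joint ML engine, the object and the probe are treated as
+        two independent conjugate-gradient sequences: each gets its own
+        Polak-Ribiere beta and its own line-search step (tmin_ob / tmin_pr).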
+ """ + super(MLSeparateGrads, self).__init__(ptycho_parent, pars) + + # Instance attributes + + # Object gradient + self.ob_grad = None + + # Object minimization direction + self.ob_h = None + + # Probe gradient + self.pr_grad = None + + # Probe minimization direction + self.pr_h = None + + # Working variables + # Object gradient + self.ob_grad_new = None + + # Probe gradient + self.pr_grad_new = None + + + # Other + self.tmin_ob = None + self.tmin_pr = None + self.ML_model = None + self.smooth_gradient = None + + self.ptycho.citations.add_article( + title='Maximum-likelihood refinement for coherent diffractive imaging', + author='Thibault P. and Guizar-Sicairos M.', + journal='New Journal of Physics', + volume=14, + year=2012, + page=63004, + doi='10.1088/1367-2630/14/6/063004', + comment='The maximum likelihood reconstruction algorithm', + ) + + def engine_initialize(self): + """ + Prepare for ML reconstruction. + """ + super(MLSeparateGrads, self).engine_initialize() + + # Object gradient and minimization direction + self.ob_grad = self.ob.copy(self.ob.ID + '_grad', fill=0.) + self.ob_grad_new = self.ob.copy(self.ob.ID + '_grad_new', fill=0.) + self.ob_h = self.ob.copy(self.ob.ID + '_h', fill=0.) + + # Probe gradient and minimization direction + self.pr_grad = self.pr.copy(self.pr.ID + '_grad', fill=0.) + self.pr_grad_new = self.pr.copy(self.pr.ID + '_grad_new', fill=0.) + self.pr_h = self.pr.copy(self.pr.ID + '_h', fill=0.) + + self.tmin_ob = 1. + self.tmin_pr = 1. + + # Other options + self.smooth_gradient = prepare_smoothing_preconditioner( + self.p.smooth_gradient) + + self._initialize_model() + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + # elif self.p.ML_type.lower() == "poisson": + # self.ML_model = PoissonModel(self) + # elif self.p.ML_type.lower() == "euclid": + # self.ML_model = EuclidModel(self) + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + + + def engine_prepare(self): + """ + Last minute initialization, everything, that needs to be recalculated, + when new data arrives. + """ + # - # fill object with coverage of views + # - for name,s in self.ob_viewcover.S.items(): + # - s.fill(s.get_view_coverage()) + self.ML_model.prepare() + + def engine_iterate(self, num=1): + """ + Compute `num` iterations. + """ + tg = 0. + tc = 0. + ta = time.time() + for it in range(num): + + ######################## + # Compute new gradients + # ob: new_ob_grad + # pr: new_pr_grad + ######################## + t1 = time.time() + error_dct = self.ML_model.new_grad() + new_ob_grad, new_pr_grad = self.ob_grad_new, self.pr_grad_new + tg += time.time() - t1 + + if self.p.probe_update_start <= self.curiter: + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + #support = self.probe_support.get(name) + #if support is not None: + # s.data *= support + # FIXME: this hack doesn't work here as we step in probe and object separately + # FIXME: really it's the probe step that should be zeroed out not the gradient + else: + new_pr_grad.fill(0.) + + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. 
- self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.data[:] = self.smooth_gradient(s.data) + + ############################ + # Compute Polak-Ribiere betas + # bt_ob = bt_num_ob/bt_denom_ob + # bt_pr = bt_num_pr/bt_denom_pr + ############################ + if self.curiter == 0: + bt_ob = 0. + bt_pr = 0. + else: + # For the object + bt_num_ob = Cnorm2(new_ob_grad) - np.real(Cdot(new_ob_grad, self.ob_grad)) + bt_denom_ob = Cnorm2(self.ob_grad) + bt_ob = max(0, bt_num_ob/bt_denom_ob) + + # For the probe + bt_num_pr = Cnorm2(new_pr_grad) - np.real(Cdot(new_pr_grad, self.pr_grad)) + bt_denom_pr = Cnorm2(self.pr_grad) + bt_pr = max(0, bt_num_pr/bt_denom_pr) + + self.ob_grad << new_ob_grad + self.pr_grad << new_pr_grad + + ############################ + # Compute next conjugates + # ob_h -= bt_ob * ob_grad + # pr_h -= bt_pr * pr_grad + # NB: in the below need to do h/tmin + # as did h*tmin when taking steps + # (don't you just love containers?) + ############################ + self.ob_h *= bt_ob / self.tmin_ob + + # Smoothing preconditioner + if self.smooth_gradient: + for name, s in self.ob_h.storages.items(): + s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data) + else: + self.ob_h -= self.ob_grad + + self.pr_h *= bt_pr / self.tmin_pr + self.pr_h -= self.pr_grad + + ######################## + # Compute step-sizes + # ob: tmin_ob + # pr: tmin_pr + ######################## + dt = self.ptycho.FType + t2 = time.time() + + if self.p.poly_line_coeffs == "quadratic": + B_ob = self.ML_model.poly_line_coeffs_ob(self.ob_h) + B_pr = self.ML_model.poly_line_coeffs_pr(self.pr_h) + + # same as above but quicker when poly quadratic + self.tmin_ob = dt(-0.5 * B_ob[1] / B_ob[2]) + self.tmin_pr = dt(-0.5 * B_pr[1] / B_pr[2]) + + else: + raise NotImplementedError("poly_line_coeffs should be 'quadratic' or 'all'") + + tc += time.time() - t2 + + ######################## + # Take conjugate steps + # ob += tmin_ob * ob_h + # pr += tmin_pr * pr_h + ######################## + self.ob_h *= self.tmin_ob + self.pr_h *= self.tmin_pr + self.ob += self.ob_h + self.pr += self.pr_h + + # Position correction + self.position_update() + + # Allow for customized modifications at the end of each iteration + self._post_iterate_update() + + # increase iteration counter + self.curiter +=1 + + logger.info('Time spent in gradient calculation: %.2f' % tg) + logger.info(' .... in coefficient calculation: %.2f' % tc) + return error_dct # np.array([[self.ML_model.LL[0]] * 3]) + + def _post_iterate_update(self): + """ + Enables modification at the end of each ML iteration. + """ + pass + + def engine_finalize(self): + """ + Delete temporary containers. + """ + del self.ptycho.containers[self.ob_grad.ID] + del self.ob_grad + del self.ptycho.containers[self.ob_grad_new.ID] + del self.ob_grad_new + del self.ptycho.containers[self.ob_h.ID] + del self.ob_h + del self.ptycho.containers[self.pr_grad.ID] + del self.pr_grad + del self.ptycho.containers[self.pr_grad_new.ID] + del self.pr_grad_new + del self.ptycho.containers[self.pr_h.ID] + del self.pr_h + + # Save floating intensities into runtime + self.ptycho.runtime["float_intens"] = parallel.gather_dict(self.ML_model.float_intens_coeff) + + # Delete model + del self.ML_model + +class BaseModel(object): + """ + Base class for log-likelihood models. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Gaussian model. 
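+
+        (Base class: holds the setup shared by all noise models. The actual
+        likelihood, gradient and line-search coefficients are supplied by
+        subclasses through new_grad and the poly_line_coeffs_* methods.)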
+ """ + self.engine = MLengine + + # Transfer commonly used attributes from ML engine + self.di = self.engine.di + self.p = self.engine.p + self.ob = self.engine.ob + self.ob_grad = self.engine.ob_grad_new + self.pr_grad = self.engine.pr_grad_new + self.pr = self.engine.pr + self.float_intens_coeff = {} + + if self.p.intensity_renormalization is None: + self.Irenorm = 1. + else: + self.Irenorm = self.p.intensity_renormalization + + if self.p.reg_del2: + self.regularizer = Regul_del2(self.p.reg_del2_amplitude) + else: + self.regularizer = None + + # Create working variables + self.LL = 0. + + + def prepare(self): + # Useful quantities + self.tot_measpts = sum(s.data.size + for s in self.di.storages.values()) + self.tot_power = self.Irenorm * sum(s.tot_power + for s in self.di.storages.values()) + # Prepare regularizer + if self.regularizer is not None: + obj_Npix = self.ob.size + expected_obj_var = obj_Npix / self.tot_power # Poisson + reg_rescale = self.tot_measpts / (8. * obj_Npix * expected_obj_var) + logger.debug( + 'Rescaling regularization amplitude using ' + 'the Poisson distribution assumption.') + logger.debug('Factor: %8.5g' % reg_rescale) + + # TODO remove usage of .p. access + self.regularizer.amplitude = self.p.reg_del2_amplitude * reg_rescale + + def __del__(self): + """ + Clean up routine + """ + # Remove working attributes + for name, diff_view in self.di.views.items(): + if not diff_view.active: + continue + try: + del diff_view.error + except: + pass + + def new_grad(self): + """ + Compute new object and probe gradient directions according to the noise model. + + Note: The negative log-likelihood and local errors should also be computed here. + """ + raise NotImplementedError + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + raise NotImplementedError + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + raise NotImplementedError + + +class GaussianModel(BaseModel): + """ + Gaussian noise model. + TODO: feed actual statistical weights instead of using the Poisson statistic heuristic. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Gaussian model. + """ + BaseModel.__init__(self, MLengine) + + # Gaussian model requires weights + # TODO: update this part of the code once actual weights are passed in the PODs + self.weights = self.engine.di.copy(self.engine.di.ID + '_weights') + # FIXME: This part needs to be updated once statistical weights are properly + # supported in the data preparation. + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + self.weights[di_view] = (self.Irenorm * di_view.pod.ma_view.data + / (1./self.Irenorm + di_view.data)) + + def __del__(self): + """ + Clean up routine + """ + BaseModel.__del__(self) + del self.engine.ptycho.containers[self.weights.ID] + del self.weights + + def new_grad(self): + """ + Compute a new gradient direction according to a Gaussian noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + self.ob_grad.fill(0.) + self.pr_grad.fill(0.) 
+ + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + Imodel = np.zeros_like(I) + f = {} + + # First pod loop: compute total intensity + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f[name] = pod.fw(pod.probe * pod.object) + Imodel += pod.downsample(u.abs2(f[name])) + + # Floating intensity option + if self.p.floating_intensities: + self.float_intens_coeff[dname] = ((w * Imodel * I).sum() + / (w * Imodel**2).sum()) + Imodel *= self.float_intens_coeff[dname] + + DI = np.double(Imodel) - I + + # Second pod loop: gradients computation + LLL = np.sum((w * DI**2).astype(np.float64)) + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + xi = pod.bw(pod.upsample(w*DI) * f[name]) + self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj() + self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj() + + diff_view.error = LLL + error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0]) + LL += LLL + + # MPI reduction of gradients + self.ob_grad.allreduce() + self.pr_grad.allreduce() + parallel.allreduce(LL) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + self.ob_grad.storages[name].data += self.regularizer.grad( + s.data) + LL += self.regularizer.LL + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. 
/ self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + return B + + +class Regul_del2(object): + """\ + Squared gradient regularizer (Gaussian prior). + + This class applies to any numpy array. + """ + def __init__(self, amplitude, axes=[-2, -1]): + # Regul.__init__(self, axes) + self.axes = axes + self.amplitude = amplitude + self.delxy = None + self.g = None + self.LL = None + + def grad(self, x): + """ + Compute and return the regularizer gradient given the array x. + """ + ax0, ax1 = self.axes + del_xf = u.delxf(x, axis=ax0) + del_yf = u.delxf(x, axis=ax1) + del_xb = u.delxb(x, axis=ax0) + del_yb = u.delxb(x, axis=ax1) + + self.delxy = [del_xf, del_yf, del_xb, del_yb] + self.g = 2. * self.amplitude*(del_xb + del_yb - del_xf - del_yf) + + self.LL = self.amplitude * (u.norm2(del_xf) + + u.norm2(del_yf) + + u.norm2(del_xb) + + u.norm2(del_yb)) + + return self.g + + def poly_line_coeffs(self, h, x=None): + ax0, ax1 = self.axes + if x is None: + del_xf, del_yf, del_xb, del_yb = self.delxy + else: + del_xf = u.delxf(x, axis=ax0) + del_yf = u.delxf(x, axis=ax1) + del_xb = u.delxb(x, axis=ax0) + del_yb = u.delxb(x, axis=ax1) + + hdel_xf = u.delxf(h, axis=ax0) + hdel_yf = u.delxf(h, axis=ax1) + hdel_xb = u.delxb(h, axis=ax0) + hdel_yb = u.delxb(h, axis=ax1) + + c0 = self.amplitude * (u.norm2(del_xf) + + u.norm2(del_yf) + + u.norm2(del_xb) + + u.norm2(del_yb)) + + c1 = 2 * self.amplitude * np.real(np.vdot(del_xf, hdel_xf) + + np.vdot(del_yf, hdel_yf) + + np.vdot(del_xb, hdel_xb) + + np.vdot(del_yb, hdel_yb)) + + c2 = self.amplitude * (u.norm2(hdel_xf) + + u.norm2(hdel_yf) + + u.norm2(hdel_xb) + + u.norm2(hdel_yb)) + + self.coeff = np.array([c0, c1, c2]) + return self.coeff + + +def prepare_smoothing_preconditioner(amplitude): + """ + Factory for smoothing preconditioner. 
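+
+    Returns None if amplitude is 0 and a Gaussian-blur callable if it is
+    positive; negative amplitudes (the old Hann filter) raise an error.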
+ """ + if amplitude == 0.: + return None + + class GaussFilt(object): + def __init__(self, sigma): + self.sigma = sigma + + def __call__(self, x): + return u.c_gf(x, [0, self.sigma, self.sigma]) + + # from scipy.signal import correlate2d + # class HannFilt: + # def __call__(self, x): + # y = np.empty_like(x) + # sh = x.shape + # xf = x.reshape((-1,) + sh[-2:]) + # yf = y.reshape((-1,) + sh[-2:]) + # for i in range(len(xf)): + # yf[i] = correlate2d(xf[i], + # np.array([[.0625, .125, .0625], + # [.125, .25, .125], + # [.0625, .125, .0625]]), + # mode='same') + # return y + + if amplitude > 0.: + logger.debug( + 'Using a smooth gradient filter (Gaussian blur - only for ML)') + return GaussFilt(amplitude) + + elif amplitude < 0.: + raise RuntimeError('Hann filter not implemented (negative smoothing amplitude not supported)') + # logger.debug( + # 'Using a smooth gradient filter (Hann window - only for ML)') + # return HannFilt() From b6536008abbfdaaefe91f4c23f7b76a05879da71 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 21 Nov 2024 13:56:18 +0000 Subject: [PATCH 02/12] Add all poly coeffs to ML_separate_grads --- ptypy/custom/ML_separate_grads.py | 174 +++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 4 deletions(-) diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/ML_separate_grads.py index 49b397f9e..abb9e45f5 100644 --- a/ptypy/custom/ML_separate_grads.py +++ b/ptypy/custom/ML_separate_grads.py @@ -35,7 +35,7 @@ class MLSeparateGrads(PositionCorrectionEngine): Defaults: [name] - default = ML + default = MLSeparateGrads type = str help = doc = @@ -287,14 +287,33 @@ def engine_iterate(self, num=1): dt = self.ptycho.FType t2 = time.time() - if self.p.poly_line_coeffs == "quadratic": + if self.p.poly_line_coeffs == "all": + B_ob = self.ML_model.poly_line_all_coeffs_ob(self.ob_h) + diffB_ob = np.arange(1,len(B_ob))*B_ob[1:] # coefficients of poly derivative + roots_ob = np.roots(np.flip(diffB_ob.astype(np.double))) # roots only supports double + real_roots_ob = np.real(roots_ob[np.isreal(roots_ob)]) # not interested in complex roots + if real_roots_ob.size == 1: # single real root + self.tmin_ob = dt(real_roots_ob[0]) + else: # find real root with smallest poly objective + evalp_ob = lambda root: np.polyval(np.flip(B_ob),root) + self.tmin_ob = dt(min(real_roots_ob, key=evalp_ob)) # root with smallest poly objective + + B_pr = self.ML_model.poly_line_all_coeffs_pr(self.pr_h) + diffB_pr = np.arange(1,len(B_pr))*B_pr[1:] # coefficients of poly derivative + roots_pr = np.roots(np.flip(diffB_pr.astype(np.double))) # roots only supports double + real_roots_pr = np.real(roots_pr[np.isreal(roots_pr)]) # not interested in complex roots + if real_roots_pr.size == 1: # single real root + self.tmin_pr = dt(real_roots_pr[0]) + else: # find real root with smallest poly objective + evalp_pr = lambda root: np.polyval(np.flip(B_pr),root) + self.tmin_pr = dt(min(real_roots_pr, key=evalp_pr)) # root with smallest poly objective + elif self.p.poly_line_coeffs == "quadratic": B_ob = self.ML_model.poly_line_coeffs_ob(self.ob_h) B_pr = self.ML_model.poly_line_coeffs_pr(self.pr_h) # same as above but quicker when poly quadratic self.tmin_ob = dt(-0.5 * B_ob[1] / B_ob[2]) self.tmin_pr = dt(-0.5 * B_pr[1] / B_pr[2]) - else: raise NotImplementedError("poly_line_coeffs should be 'quadratic' or 'all'") @@ -440,6 +459,20 @@ def poly_line_coeffs_pr(self, pr_h): """ raise NotImplementedError + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial 
for line minimization + in direction h for the object + """ + raise NotImplementedError + + def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the probe + """ + raise NotImplementedError + class GaussianModel(BaseModel): """ @@ -639,7 +672,6 @@ def poly_line_coeffs_pr(self, pr_h): a = pod.fw(pr_h[pod.pr_view] * pod.object) if A0 is None: - A0 = u.abs2(f).astype(np.longdouble) A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) A2 = u.abs2(a).astype(np.longdouble) @@ -672,6 +704,140 @@ def poly_line_coeffs_pr(self, pr_h): self.B = B return B + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + B[3] += np.dot(w.flat, (2*A1*A2).flat) * Brenorm + B[4] += np.dot(w.flat, (A2**2).flat) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1. 
/ self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + B[3] += np.dot(w.flat, (2*A1*A2).flat) * Brenorm + B[4] += np.dot(w.flat, (A2**2).flat) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + class Regul_del2(object): """\ From ef7e14ebbeb0e2b5b37b18e15261e4d04b9c5ac0 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 21 Nov 2024 14:53:54 +0000 Subject: [PATCH 03/12] Add Poisson and Euclid models to ML_separate_grads --- ptypy/custom/ML_separate_grads.py | 736 +++++++++++++++++++++++++++++- 1 file changed, 729 insertions(+), 7 deletions(-) diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/ML_separate_grads.py index abb9e45f5..3c77e4b54 100644 --- a/ptypy/custom/ML_separate_grads.py +++ b/ptypy/custom/ML_separate_grads.py @@ -4,7 +4,7 @@ TODO. - * Implement other ML models (Poisson/Euclid) + * Implement other regularizers This file is part of the PTYPY package. @@ -181,10 +181,10 @@ def _initialize_model(self): # Create noise model if self.p.ML_type.lower() == "gaussian": self.ML_model = GaussianModel(self) - # elif self.p.ML_type.lower() == "poisson": - # self.ML_model = PoissonModel(self) - # elif self.p.ML_type.lower() == "euclid": - # self.ML_model = EuclidModel(self) + elif self.p.ML_type.lower() == "poisson": + self.ML_model = PoissonModel(self) + elif self.p.ML_type.lower() == "euclid": + self.ML_model = EuclidModel(self) else: raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) @@ -707,7 +707,7 @@ def poly_line_coeffs_pr(self, pr_h): def poly_line_all_coeffs_ob(self, ob_h): """ Compute all the coefficients of the polynomial for line minimization - in direction h + in direction h for the object """ B = np.zeros((5,), dtype=np.longdouble) @@ -777,7 +777,7 @@ def poly_line_all_coeffs_ob(self, ob_h): def poly_line_all_coeffs_pr(self, pr_h): """ Compute all the coefficients of the polynomial for line minimization - in direction h + in direction h for the probe """ B = np.zeros((5,), dtype=np.longdouble) @@ -839,6 +839,728 @@ def poly_line_all_coeffs_pr(self, pr_h): return B +class PoissonModel(BaseModel): + """ + Poisson noise model. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Poisson model. 
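+
+        The constant ln(I!) term of the Poisson negative log-likelihood is
+        precomputed per view with scipy.special.gammaln and added back
+        whenever the likelihood is evaluated.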
+ """ + BaseModel.__init__(self, MLengine) + from scipy import special + self.LLbase = {} + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + self.LLbase[name] = special.gammaln(di_view.data+1).sum() + + def new_grad(self): + """ + Compute a new gradient direction according to a Poisson noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + self.ob_grad.fill(0.) + self.pr_grad.fill(0.) + + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Mask and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + Imodel = np.zeros_like(I) + f = {} + + # First pod loop: compute total intensity + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f[name] = pod.fw(pod.probe * pod.object) + Imodel += u.abs2(f[name]) + + # Floating intensity option + if self.p.floating_intensities: + self.float_intens_coeff[dname] = I.sum() / Imodel.sum() + Imodel *= self.float_intens_coeff[dname] + + Imodel += 1e-6 + DI = m * (1. - I / Imodel) + + # Second pod loop: gradients computation + LLL = self.LLbase[dname] + (m * (Imodel - I * np.log(Imodel))).sum().astype(np.float64) + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + xi = pod.bw(DI * f[name]) + self.ob_grad[pod.ob_view] += 2 * xi * pod.probe.conj() + self.pr_grad[pod.pr_view] += 2 * xi * pod.object.conj() + + diff_view.error = LLL + error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0]) + LL += LLL + + # MPI reduction of gradients + self.ob_grad.allreduce() + self.pr_grad.allreduce() + parallel.allreduce(LL) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + self.ob_grad.storages[name].data += self.regularizer.grad( + s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. 
- I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. - I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the object + """ + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. 
- I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + B[3] += (0.5*np.dot(m.flat, (I*((2*A1*A2)/A0**2)).flat)) * Brenorm + B[4] += (0.5*np.dot(m.flat, (I*((A2**2)/A0**2)).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + +def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the probe + """ + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. - I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + B[3] += (0.5*np.dot(m.flat, (I*((2*A1*A2)/A0**2)).flat)) * Brenorm + B[4] += (0.5*np.dot(m.flat, (I*((A2**2)/A0**2)).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + +class EuclidModel(BaseModel): + """ + Euclidean (Amplitude) noise model. + TODO: feed actual statistical weights instead of using a fixed variance. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Euclidean model. + """ + BaseModel.__init__(self, MLengine) + + # Euclidean model requires weights + # TODO: update this part of the code once actual weights are passed in the PODs + self.weights = self.engine.di.copy(self.engine.di.ID + '_weights') + # FIXME: This part needs to be updated once statistical weights are properly + # supported in the data preparation. + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + self.weights[di_view] = di_view.pod.ma_view.data # just the mask for now + #self.weights[di_view] = (di_view.pod.ma_view.data + # / (1. 
+ stat_weights/di_view.data)) + + def __del__(self): + """ + Clean up routine + """ + BaseModel.__del__(self) + del self.engine.ptycho.containers[self.weights.ID] + del self.weights + + def new_grad(self): + """ + Compute a new gradient direction according to a Euclidean noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + self.ob_grad.fill(0.) + self.pr_grad.fill(0.) + + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + Amodel = np.zeros_like(A) + f = {} + + # First pod loop: compute total amplitude + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f[name] = pod.fw(pod.probe * pod.object) + Amodel += np.sqrt(u.abs2(f[name])) + + # Floating intensity option + if self.p.floating_intensities: + self.float_intens_coeff[dname] = A.sum() / Amodel.sum() + Amodel *= self.float_intens_coeff[dname] + + Amodel += 1e-6 # cf Poisson model + DA = (1. - A / Amodel) + + # Second pod loop: gradients computation + LLL = np.sum((w * (Amodel - A)**2).astype(np.float64)) + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + xi = pod.bw(w*DA * f[name]) + self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj() + self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj() + + diff_view.error = LLL + error_dct[dname] = np.array([0, LLL / np.prod(DA.shape), 0]) + LL += LLL + + # MPI reduction of gradients + self.ob_grad.allreduce() + self.pr_grad.allreduce() + parallel.allreduce(LL) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + self.ob_grad.storages[name].data += self.regularizer.grad( + s.data) + LL += self.regularizer.LL + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. 
- A/np.sqrt(A0) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. - A/np.sqrt(A0) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the object + """ + + B = np.zeros((9,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. 
- A/np.sqrt(A0) + DA32 = A/A0**(3/2) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * DA32).flat)) * Brenorm + B[3] += (0.25*np.dot(w.flat, (2*A1*A2 * DA32).flat) - 0.125*np.dot(w.flat, (A1**3/A0**2).flat)) * Brenorm + B[4] += (0.25*np.dot(w.flat, ((A2**2) * DA32).flat) - 0.125*np.dot(w.flat, ((3*A1**2*A2)/A0**2).flat) + + 0.015625*np.dot(w.flat, (A1**4/A0**3).flat)) * Brenorm + B[5] += (- 0.125*np.dot(w.flat, ((3*A1*A2**2)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((4*A1**3*A2)/A0**3).flat)) * Brenorm + B[6] += (- 0.125*np.dot(w.flat, ((A2**3)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((6*A1**2*A2**2)/A0**3).flat)) * Brenorm + B[7] += (0.015625*np.dot(w.flat, ((4*A1*A2**3)/A0**3).flat)) * Brenorm + B[8] += (0.015625*np.dot(w.flat, ((A2**4)/A0**3).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + +def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the probe + """ + + B = np.zeros((9,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. - A/np.sqrt(A0) + DA32 = A/A0**(3/2) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * DA32).flat)) * Brenorm + B[3] += (0.25*np.dot(w.flat, (2*A1*A2 * DA32).flat) - 0.125*np.dot(w.flat, (A1**3/A0**2).flat)) * Brenorm + B[4] += (0.25*np.dot(w.flat, ((A2**2) * DA32).flat) - 0.125*np.dot(w.flat, ((3*A1**2*A2)/A0**2).flat) + + 0.015625*np.dot(w.flat, (A1**4/A0**3).flat)) * Brenorm + B[5] += (- 0.125*np.dot(w.flat, ((3*A1*A2**2)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((4*A1**3*A2)/A0**3).flat)) * Brenorm + B[6] += (- 0.125*np.dot(w.flat, ((A2**3)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((6*A1**2*A2**2)/A0**3).flat)) * Brenorm + B[7] += (0.015625*np.dot(w.flat, ((4*A1*A2**3)/A0**3).flat)) * Brenorm + B[8] += (0.015625*np.dot(w.flat, ((A2**4)/A0**3).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. 
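+    # Note: the amplitude model |fw((probe + t*pr_h)*object)| is the square
+    # root of a quadratic in t (A0 + t*A1 + t**2*A2), so expanding the
+    # likelihood along the search direction produces terms up to t**8,
+    # hence the nine coefficients B[0..8] accumulated above.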
+ + self.B = B + + return B + + class Regul_del2(object): """\ Squared gradient regularizer (Gaussian prior). From ae6f01b54389e57aaa52fd526b7b38164e343016 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 21 Nov 2024 16:04:26 +0000 Subject: [PATCH 04/12] Serialize ML with separate gradient steps --- ...L_separate_grads.py => MLSeparateGrads.py} | 0 ptypy/custom/MLSeparateGrads_serial.py | 612 ++++++++++++++++++ 2 files changed, 612 insertions(+) rename ptypy/custom/{ML_separate_grads.py => MLSeparateGrads.py} (100%) create mode 100644 ptypy/custom/MLSeparateGrads_serial.py diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/MLSeparateGrads.py similarity index 100% rename from ptypy/custom/ML_separate_grads.py rename to ptypy/custom/MLSeparateGrads.py diff --git a/ptypy/custom/MLSeparateGrads_serial.py b/ptypy/custom/MLSeparateGrads_serial.py new file mode 100644 index 000000000..6731e00aa --- /dev/null +++ b/ptypy/custom/MLSeparateGrads_serial.py @@ -0,0 +1,612 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2024 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +import time + +from .MLSeparateGrads import MLSeparateGrads, BaseModel +from ..accelerate.base.engines.projectional_serial import serialize_array_access +from .. import utils as u +from ..utils.verbose import logger, log +from ..utils import parallel +from ..engines.utils import Cnorm2, Cdot +from ..engines import register +from ..accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..accelerate.base.array_utils import complex_gaussian_filter, complex_gaussian_filter_fft + + +__all__ = ['MLSeparateGrads_serial'] + +@register() +class MLSeparateGrads_serial(MLSeparateGrads): + + """ + Defaults: + + [smooth_gradient_method] + default = convolution + type = str + help = Method to be used for smoothing the gradient, choose between ```convolution``` or ```fft```. + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. + """ + super(MLSeparateGrads_serial, self).__init__(ptycho_parent, pars) + + self.kernels = {} + self.diff_info = {} + self.cn2_ob_grad = 0. + self.cn2_pr_grad = 0. + + def engine_initialize(self): + """ + Prepare for ML reconstruction. + """ + super(MLSeparateGrads_serial, self).engine_initialize() + self._setup_kernels() + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. 
Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = scan.max_frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = aux + kern.a = np.zeros(ash, dtype=np.complex64) + kern.b = np.zeros(ash, dtype=np.complex64) + + # setup kernels, one for each SCAN. + kern.GDK = GradientDescentKernel(aux, nmodes) + kern.GDK.allocate() + + kern.POK = PoUpdateKernel() + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel() + kern.AWK.allocate() + + kern.FW = geo.propagator.fw + kern.BW = geo.propagator.bw + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution) + kern.PCK.allocate() + + def engine_prepare(self): + + ## Serialize new data ## + + for label, d in self.ptycho.new_data: + prep = u.Param() + prep.label = label + self.diff_info[d.ID] = prep + prep.err_phot = np.zeros((d.data.shape[0],), dtype=np.float32) + # set floating intensity coefficients to 1.0 + # they get overridden if self.p.floating_intensities=True + prep.float_intens_coeff = np.ones((d.data.shape[0],), dtype=np.float32) + + # Unfortunately this needs to be done for all pods, since + # the shape of the probe / object was modified. + # TODO: possible scaling issue + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.view_IDs, prep.poe_IDs, prep.addr = serialize_array_access(d) + # Re-create exit addresses when gradient models (single exit buffer per view) are used + # TODO: this should not be necessary, kernels should not use exit wave information + if self.kernels[prep.label].scanmodel in ("GradFull", "BlockGradFull"): + for i,addr in enumerate(prep.addr): + nmodes = len(addr[:,2,0]) + for j,ea in enumerate(addr[:,2,0]): + prep.addr[i,j,2,0] = i*nmodes+j + prep.I = d.data + if self.do_position_refinement: + prep.original_addr = np.zeros_like(prep.addr) + prep.original_addr[:] = prep.addr + prep.ma = self.ma.S[d.ID].data + + self.ML_model.prepare() + + def _get_smooth_gradient(self, data, sigma): + if self.p.smooth_gradient_method == "convolution": + return complex_gaussian_filter(data, sigma) + elif self.p.smooth_gradient_method == "fft": + return complex_gaussian_filter_fft(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. 
- self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.data[:] = self._get_smooth_gradient(s.data, self.smooth_gradient.sigma) + + norm = Cnorm2(new_ob_grad) + dot = np.real(Cdot(new_ob_grad, self.ob_grad)) + self.ob_grad << new_ob_grad + return norm, dot + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # probe support + if self.p.probe_update_start <= self.curiter: + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + # FIXME: this hack doesn't work here as we step in probe and object separately + # FIXME: really it's the probe step that should be zeroed out not the gradient + else: + new_pr_grad.fill(0.) + + norm = Cnorm2(new_pr_grad) + dot = np.real(Cdot(new_pr_grad, self.pr_grad)) + self.pr_grad << new_pr_grad + return norm, dot + + def engine_iterate(self, num=1): + """ + Compute `num` iterations. + """ + ######################## + # Compute new gradient + ######################## + tg = 0. + tc = 0. + ta = time.time() + for it in range(num): + + ######################## + # Compute new gradients + # ob: new_ob_grad + # pr: new_pr_grad + ######################## + t1 = time.time() + error_dct = self.ML_model.new_grad() + tg += time.time() - t1 + + cn2_new_pr_grad, cdotr_pr_grad = self._replace_pr_grad() + cn2_new_ob_grad, cdotr_ob_grad = self._replace_ob_grad() + + ############################ + # Compute Polak-Ribiere betas + # bt_ob = bt_num_ob/bt_denom_ob + # bt_pr = bt_num_pr/bt_denom_pr + ############################ + if self.curiter == 0: + bt_ob = 0. + bt_pr = 0. + else: + # For the object + bt_num_ob = cn2_new_ob_grad - cdotr_ob_grad + bt_denom_ob = self.cn2_ob_grad + bt_ob = max(0, bt_num_ob/bt_denom_ob) + + # For the probe + bt_num_pr = cn2_new_pr_grad - cdotr_pr_grad + bt_denom_pr = self.cn2_pr_grad + bt_pr = max(0, bt_num_pr/bt_denom_pr) + + self.cn2_ob_grad = cn2_new_ob_grad + self.cn2_pr_grad = cn2_new_pr_grad + + ############################ + # Compute next conjugates + # ob_h -= bt_ob * ob_grad + # pr_h -= bt_pr * pr_grad + # NB: in the below need to do h/tmin + # as did h*tmin when taking steps + # (don't you just love containers?) + ############################ + dt = self.ptycho.FType + self.ob_h *= dt(bt_ob / self.tmin_ob) + + # Smoothing preconditioner + if self.smooth_gradient: + for name, s in self.ob_h.storages.items(): + s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data, self.smooth_gradient.sigma) + else: + self.ob_h -= self.ob_grad + + self.pr_h *= dt(bt_pr / self.tmin_pr) + self.pr_h -= self.pr_grad + + ######################## + # Compute step-sizes + # ob: tmin_ob + # pr: tmin_pr + ######################## + t2 = time.time() + B_ob = self.ML_model.poly_line_coeffs_ob(self.ob_h) + B_pr = self.ML_model.poly_line_coeffs_pr(self.pr_h) + tc += time.time() - t2 + + self.tmin_ob = dt(-0.5 * B_ob[1] / B_ob[2]) + self.tmin_pr = dt(-0.5 * B_pr[1] / B_pr[2]) + + ######################## + # Take conjugate steps + # ob += tmin_ob * ob_h + # pr += tmin_pr * pr_h + ######################## + self.ob_h *= self.tmin_ob + self.pr_h *= self.tmin_pr + self.ob += self.ob_h + self.pr += self.pr_h + + # Position correction + self.position_update() + + # Allow for customized modifications at the end of each iteration + self._post_iterate_update() + + # increase iteration counter + self.curiter += 1 + + logger.info('Time spent in gradient calculation: %.2f' % tg) + logger.info(' .... 
in coefficient calculation: %.2f' % tc)
+ return error_dct # np.array([[self.ML_model.LL[0]] * 3])
+
+ def position_update(self):
+ """
+ Position refinement
+ """
+ if not self.do_position_refinement:
+ return
+ do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
+ do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
+
+ # Update positions
+ if do_update_pos:
+ """
+ Iterates through all positions and refines them by a given algorithm.
+ """
+ log(4, "----------- START POS REF -------------")
+ for dID in self.di.S.keys():
+
+ prep = self.diff_info[dID]
+ pID, oID, eID = prep.poe_IDs
+ ob = self.ob.S[oID].data
+ pr = self.pr.S[pID].data
+ kern = self.kernels[prep.label]
+ aux = kern.aux
+ addr = prep.addr
+ original_addr = prep.original_addr
+ mangled_addr = addr.copy()
+ w = prep.weights
+ I = prep.I
+ err_phot = prep.err_phot
+
+ PCK = kern.PCK
+ FW = kern.FW
+
+ # Keep track of object boundaries
+ max_oby = ob.shape[-2] - aux.shape[-2] - 1
+ max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+ # We need to re-calculate the current error
+ PCK.build_aux(aux, addr, ob, pr)
+ aux[:] = FW(aux)
+ PCK.log_likelihood_ml(aux, addr, I, w, err_phot)
+ error_state = np.zeros_like(err_phot)
+ error_state[:] = err_phot
+ PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+
+ log(4, 'Position refinement trial: iteration %s' % (self.curiter))
+ for i in range(PCK.mangler.nshifts):
+ PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
+ PCK.build_aux(aux, mangled_addr, ob, pr)
+ aux[:] = FW(aux)
+ PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot)
+ PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_phot)
+
+ prep.err_phot = error_state
+ prep.addr = addr
+
+ def engine_finalize(self):
+ """
+ Try deleting every helper container
+ """
+ if self.do_position_refinement and self.p.position_refinement.record:
+ for label, d in self.di.storages.items():
+ prep = self.diff_info[d.ID]
+ res = self.kernels[prep.label].resolution
+ for i,view in enumerate(d.views):
+ for j,(pname, pod) in enumerate(view.pods.items()):
+ delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res
+ pod.ob_view.coord += delta
+ pod.ob_view.storage.update_views(pod.ob_view)
+ self.ptycho.record_positions = True
+
+ # Save floating intensities into runtime
+ float_intens_coeff = {}
+ for label, d in self.di.storages.items():
+ prep = self.diff_info[d.ID]
+ float_intens_coeff[label] = prep.float_intens_coeff
+ self.ptycho.runtime["float_intens"] = parallel.gather_dict(float_intens_coeff)
+ super().engine_finalize()
+
+
+class BaseModelSerial(BaseModel):
+ """
+ Base class for log-likelihood models.
+ """
+
+ def __del__(self):
+ """
+ Clean up routine
+ """
+ pass
+
+
+class GaussianModel(BaseModelSerial):
+ """
+ Gaussian noise model.
+ TODO: feed actual statistical weights instead of using the Poisson statistic heuristic.
+ """
+
+ def __init__(self, MLengine):
+ """
+ Core functions for ML computation using a Gaussian model.
+ """
+ super(GaussianModel, self).__init__(MLengine)
+
+ def prepare(self):
+
+ super(GaussianModel, self).prepare()
+
+ for label, d in self.engine.ptycho.new_data:
+ prep = self.engine.diff_info[d.ID]
+ prep.weights = (self.Irenorm * self.engine.ma.S[d.ID].data
+ / (1.
/ self.Irenorm + d.data)).astype(d.data.dtype)
+
+ def __del__(self):
+ """
+ Clean up routine
+ """
+ super(GaussianModel, self).__del__()
+
+ def new_grad(self):
+ """
+ Compute a new gradient direction according to a Gaussian noise model.
+
+ Note: The negative log-likelihood and local errors are also computed
+ here.
+ """
+ ob_grad = self.engine.ob_grad_new
+ pr_grad = self.engine.pr_grad_new
+ ob_grad << 0.
+ pr_grad << 0.
+
+ # We need an array for MPI
+ LL = np.array([0.])
+ error_dct = {}
+
+ for dID in self.di.S.keys():
+ prep = self.engine.diff_info[dID]
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+ POK = kern.POK
+
+ aux = kern.aux
+
+ FW = kern.FW
+ BW = kern.BW
+
+ # get addresses and auxiliary array
+ addr = prep.addr
+ w = prep.weights
+ err_phot = prep.err_phot
+ fic = prep.float_intens_coeff
+
+ # local references
+ ob = self.engine.ob.S[oID].data
+ obg = ob_grad.S[oID].data
+ pr = self.engine.pr.S[pID].data
+ prg = pr_grad.S[pID].data
+ I = prep.I
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
+
+ # forward prop
+ aux[:] = FW(aux)
+
+ GDK.make_model(aux, addr)
+
+ if self.p.floating_intensities:
+ GDK.floating_intensity(addr, w, I, fic)
+
+ GDK.main(aux, addr, w, I)
+ GDK.error_reduce(addr, err_phot)
+ aux[:] = BW(aux)
+
+ POK.ob_update_ML(addr, obg, pr, aux)
+ POK.pr_update_ML(addr, prg, ob, aux)
+
+ for dID, prep in self.engine.diff_info.items():
+ err_phot = prep.err_phot
+ LL += err_phot.sum()
+ err_phot /= np.prod(prep.weights.shape[-2:])
+ err_fourier = np.zeros_like(err_phot)
+ err_exit = np.zeros_like(err_phot)
+ errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
+ error_dct.update(zip(prep.view_IDs, errs))
+
+ # MPI reduction of gradients
+ ob_grad.allreduce()
+ pr_grad.allreduce()
+ parallel.allreduce(LL)
+
+ # Object regularizer
+ if self.regularizer:
+ for name, s in self.engine.ob.storages.items():
+ ob_grad.storages[name].data += self.regularizer.grad(s.data)
+ LL += self.regularizer.LL
+
+ self.LL = LL / self.tot_measpts
+ return error_dct
+
+ def poly_line_coeffs_ob(self, c_ob_h):
+ """
+ Compute the coefficients of the polynomial for line minimization
+ in direction h for the object
+ """
+
+ B = np.zeros((3,), dtype=np.longdouble)
+ Brenorm = 1.
/ self.LL[0] ** 2
+
+ # Outer loop: through diffraction patterns
+ for dID in self.di.S.keys():
+ prep = self.engine.diff_info[dID]
+
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+
+ f = kern.aux
+ a = kern.a
+
+ FW = kern.FW
+
+ # get addresses and auxiliary array
+ addr = prep.addr
+ w = prep.weights
+ fic = prep.float_intens_coeff
+
+ # local references
+ ob = self.ob.S[oID].data
+ ob_h = c_ob_h.S[oID].data
+ pr = self.pr.S[pID].data
+ I = self.di.S[dID].data
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
+ AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False)
+
+ # forward prop
+ f[:] = FW(f)
+ a[:] = FW(a)
+
+ GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel
+ GDK.fill_b(addr, Brenorm, w, B)
+
+ parallel.allreduce(B)
+
+ # Object regularizer
+ if self.regularizer:
+ for name, s in self.ob.storages.items():
+ B += Brenorm * self.regularizer.poly_line_coeffs(
+ c_ob_h.storages[name].data, s.data)
+
+ self.B = B
+
+ return B
+
+ def poly_line_coeffs_pr(self, c_pr_h):
+ """
+ Compute the coefficients of the polynomial for line minimization
+ in direction h for the probe
+ """
+
+ B = np.zeros((3,), dtype=np.longdouble)
+ Brenorm = 1. / self.LL[0] ** 2
+
+ # Outer loop: through diffraction patterns
+ for dID in self.di.S.keys():
+ prep = self.engine.diff_info[dID]
+
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+
+ f = kern.aux
+ a = kern.a
+
+ FW = kern.FW
+
+ # get addresses and auxiliary array
+ addr = prep.addr
+ w = prep.weights
+ fic = prep.float_intens_coeff
+
+ # local references
+ ob = self.ob.S[oID].data
+ pr = self.pr.S[pID].data
+ pr_h = c_pr_h.S[pID].data
+ I = self.di.S[dID].data
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
+ AWK.build_aux_no_ex(a, addr, ob, pr_h, add=True)
+
+ # forward prop
+ f[:] = FW(f)
+ a[:] = FW(a)
+
+ GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel
+ GDK.fill_b(addr, Brenorm, w, B)
+
+ parallel.allreduce(B)
+
+ self.B = B
+
+ return B

From 17ba4adfe557eba0443170dc7f1ea33c9c901d70 Mon Sep 17 00:00:00 2001
From: Jaroslav Fowkes
Date: Thu, 21 Nov 2024 16:21:48 +0000
Subject: [PATCH 05/12] Drop probe_update_start from MLSeparateGrads

---
 ptypy/custom/MLSeparateGrads.py | 24 ++++++------------------
 ptypy/custom/MLSeparateGrads_serial.py | 12 +++---------
 2 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/ptypy/custom/MLSeparateGrads.py b/ptypy/custom/MLSeparateGrads.py
index 3c77e4b54..1448f879d 100644
--- a/ptypy/custom/MLSeparateGrads.py
+++ b/ptypy/custom/MLSeparateGrads.py
@@ -88,13 +88,6 @@ class MLSeparateGrads(PositionCorrectionEngine):
 help = Whether to use the object/probe scaling preconditioner
 doc = This parameter can give faster convergence for weakly scattering samples.
- [probe_update_start] - default = 0 - type = int - lowlim = 0 - help = Number of iterations before probe update starts - # NOTE: probe_update_start doesn't work with this code, need to add some code to fix this - [poly_line_coeffs] default = quadratic type = str @@ -219,17 +212,12 @@ def engine_iterate(self, num=1): new_ob_grad, new_pr_grad = self.ob_grad_new, self.pr_grad_new tg += time.time() - t1 - if self.p.probe_update_start <= self.curiter: - # Apply probe support if needed - for name, s in new_pr_grad.storages.items(): - self.support_constraint(s) - #support = self.probe_support.get(name) - #if support is not None: - # s.data *= support - # FIXME: this hack doesn't work here as we step in probe and object separately - # FIXME: really it's the probe step that should be zeroed out not the gradient - else: - new_pr_grad.fill(0.) + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + #support = self.probe_support.get(name) + #if support is not None: + # s.data *= support # Smoothing preconditioner if self.smooth_gradient: diff --git a/ptypy/custom/MLSeparateGrads_serial.py b/ptypy/custom/MLSeparateGrads_serial.py index 6731e00aa..572e9a323 100644 --- a/ptypy/custom/MLSeparateGrads_serial.py +++ b/ptypy/custom/MLSeparateGrads_serial.py @@ -174,15 +174,9 @@ def _replace_ob_grad(self): def _replace_pr_grad(self): new_pr_grad = self.pr_grad_new - # probe support - if self.p.probe_update_start <= self.curiter: - # Apply probe support if needed - for name, s in new_pr_grad.storages.items(): - self.support_constraint(s) - # FIXME: this hack doesn't work here as we step in probe and object separately - # FIXME: really it's the probe step that should be zeroed out not the gradient - else: - new_pr_grad.fill(0.) + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) norm = Cnorm2(new_pr_grad) dot = np.real(Cdot(new_pr_grad, self.pr_grad)) From fe0df7d260b10fb16fb5f9025e3af82bfeeade91 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 6 Jan 2025 09:04:04 +0000 Subject: [PATCH 06/12] Add required kernel function --- ptypy/accelerate/base/kernels.py | 31 ++++++++++++++++++++++++++ ptypy/custom/MLSeparateGrads_serial.py | 4 ++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index c5343f940..2d6b438da 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -269,6 +269,37 @@ def make_model(self, b_aux, addr): tf = aux.reshape(sh[0], self.nmodes, sh[1], sh[2]) Imodel[:] = ((tf * tf.conj()).real).sum(1) + def make_a012_notb(self, b_f, b_a, addr, I, fic): + + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.npy.Imodel + A1 = self.npy.LLerr + A2 = self.npy.LLden + + # batch buffers + f = b_f[:maxz * self.nmodes] + a = b_a[:maxz * self.nmodes] + + ## Actual math ## (subset of FUK.fourier_error) + fc = fic.reshape((maxz,1,1)) + A0.fill(0.) + tf = np.real(f * f.conj()).astype(self.ftype) + A0[:maxz] = np.double(tf.reshape(maxz, self.nmodes, sh[1], sh[2]).sum(1) * fc) - I + + A1.fill(0.) + tf = 2. * np.real(f * a.conj()) + A1[:maxz] = tf.reshape(maxz, self.nmodes, sh[1], sh[2]).sum(1) * fc + + A2.fill(0.) 
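+ # A2 is the pure second-order coefficient: per-mode |a|^2, summed over
+ # modes and scaled by the floating-intensity coefficient (no b array is
+ # passed here, hence no 2*Re(f*conj(b)) cross term in this variant)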
+ tf = np.real(a * a.conj()) + A2[:maxz] = tf.reshape(maxz, self.nmodes, sh[1], sh[2]).sum(1) * fc + return + def make_a012(self, b_f, b_a, b_b, addr, I, fic): # reference shape (= GPU global dims) diff --git a/ptypy/custom/MLSeparateGrads_serial.py b/ptypy/custom/MLSeparateGrads_serial.py index 572e9a323..e5c2283e7 100644 --- a/ptypy/custom/MLSeparateGrads_serial.py +++ b/ptypy/custom/MLSeparateGrads_serial.py @@ -536,7 +536,7 @@ def poly_line_coeffs_ob(self, c_ob_h): f[:] = FW(f) a[:] = FW(a) - GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel + GDK.make_a012_notb(f, a, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) @@ -596,7 +596,7 @@ def poly_line_coeffs_pr(self, c_pr_h): f[:] = FW(f) a[:] = FW(a) - GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel + GDK.make_a012_notb(f, a, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) From efe000ff9b266cd3e0dbe70a39263f4b2364cb8d Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 6 Jan 2025 09:18:57 +0000 Subject: [PATCH 07/12] Add kernel regression test --- .../gradient_descent_kernel_test.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py index 453edc2ff..9749f8e39 100644 --- a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py +++ b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py @@ -125,6 +125,67 @@ def test_floating_intensity(self): np.testing.assert_array_almost_equal(exp_fic, fic, err_msg="floating intensity coeff (fic) has not been updated as expected") + def test_make_a012_notb(self): + b_f, b_a, _, I, w, err_sum, addr = self.prepare_arrays() + fic = np.ones((I.shape[0],), dtype=I.dtype) + GDK=GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012_notb(b_f, b_a, addr, I, fic) + #print('Imodel',repr(GDK.npy.Imodel)) + #print('LLerr',repr(GDK.npy.LLerr)) + #print('LLden',repr(GDK.npy.LLden)) + exp_A0 = np.array([[[ 1., 1., 1.], + [ 2., 2., 2.], + [ 5., 5., 5.]], + + [[12., 12., 12.], + [13., 13., 13.], + [16., 16., 16.]], + + [[37., 37., 37.], + [38., 38., 38.], + [41., 41., 41.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal(exp_A0, GDK.npy.Imodel, + err_msg="`Imodel` buffer (=A0) has not been updated as expected") + exp_A1 = np.array([[[ 0., 0., 0.], + [ 2., 6., 10.], + [ 4., 12., 20.]], + + [[ 0., 0., 0.], + [10., 14., 18.], + [20., 28., 36.]], + + [[ 0., 0., 0.], + [18., 22., 26.], + [36., 44., 52.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal(exp_A1, GDK.npy.LLerr, + err_msg="`LLerr` buffer (=A1) has not been updated as expected") + exp_A2 = np.array([[[ 0., 4., 12.], + [ 4., 8., 16.], + [12., 16., 24.]], + + [[ 0., 12., 28.], + [12., 24., 40.], + [28., 40., 56.]], + + [[ 0., 20., 44.], + [20., 40., 64.], + [44., 64., 88.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + print(GDK.npy.LLden) + np.testing.assert_array_almost_equal(exp_A2, GDK.npy.LLden, + err_msg="`LLden` buffer (=A2) has not been updated as expected") def test_make_a012(self): b_f, b_a, b_b, I, w, err_sum, addr = self.prepare_arrays() From 60c7e60828e1add0490e86434dc5e27e7a334397 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 6 Jan 2025 09:29:23 +0000 Subject: [PATCH 08/12] Fix kernel regression test --- 
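[Annotation, not part of the commit: make_a012_notb models the intensity
along a step t as sum_modes |f + t*a|^2 * fic, so the quadratic coefficient
is A2 = sum_modes |a|^2 * fic; the previous expected values appear to have
been carried over from test_make_a012, which includes the 2*Re(f*conj(b))
cross term. A minimal NumPy sketch of the identity being tested, with
hypothetical single-mode values (fic == 1, not the test fixtures):

import numpy as np
f = np.array([1 + 2j, 3 + 0j])    # propagated aux wave (hypothetical)
a = np.array([0 + 1j, 1 + 1j])    # propagated step direction (hypothetical)
I = np.array([2., 4.])            # measured intensities (hypothetical)
A0 = np.abs(f)**2 - I             # model minus data at t = 0
A1 = 2 * np.real(f * np.conj(a))  # first-order coefficient
A2 = np.abs(a)**2                 # second-order coefficient, no b term
t = 0.7                           # any step size
assert np.allclose(np.abs(f + t*a)**2 - I, A0 + A1*t + A2*t**2)
]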
.../gradient_descent_kernel_test.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py
index 9749f8e39..675d7690a 100644
--- a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py
+++ b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py
@@ -168,22 +168,21 @@ def test_make_a012_notb(self):
 [ 0., 0., 0.]]], dtype=FLOAT_TYPE)
 np.testing.assert_array_almost_equal(exp_A1, GDK.npy.LLerr,
 err_msg="`LLerr` buffer (=A1) has not been updated as expected")
- exp_A2 = np.array([[[ 0., 4., 12.],
- [ 4., 8., 16.],
- [12., 16., 24.]],
-
- [[ 0., 12., 28.],
- [12., 24., 40.],
- [28., 40., 56.]],
-
- [[ 0., 20., 44.],
- [20., 40., 64.],
- [44., 64., 88.]],
+ exp_A2 = np.array([[[ 0., 2., 8.],
+ [ 2., 4., 10.],
+ [ 8., 10., 16.]],
+
+ [[ 0., 2., 8.],
+ [ 2., 4., 10.],
+ [ 8., 10., 16.]],
+
+ [[ 0., 2., 8.],
+ [ 2., 4., 10.],
+ [ 8., 10., 16.]],
 [[ 0., 0., 0.],
 [ 0., 0., 0.],
 [ 0., 0., 0.]]], dtype=FLOAT_TYPE)
- print(GDK.npy.LLden)
 np.testing.assert_array_almost_equal(exp_A2, GDK.npy.LLden,
 err_msg="`LLden` buffer (=A2) has not been updated as expected")

From 7c1a288060c96438598af36c43297f6264f2319c Mon Sep 17 00:00:00 2001
From: Jari
Date: Tue, 7 Jan 2025 14:52:21 +0000
Subject: [PATCH 09/12] Add CUDA kernel

---
 .../accelerate/cuda_common/make_a012_notb.cu | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 ptypy/accelerate/cuda_common/make_a012_notb.cu

diff --git a/ptypy/accelerate/cuda_common/make_a012_notb.cu b/ptypy/accelerate/cuda_common/make_a012_notb.cu
new file mode 100644
index 000000000..94c7ab68a
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/make_a012_notb.cu
@@ -0,0 +1,61 @@
+/** make_a012 without the b (second-order) array.
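+ * Computes the quadratic coefficients of the modelled intensity along a
+ * step t, I(t) = sum_modes |f + t*a|^2 * fic:
+ * A0 = sum_modes |f|^2 * fic - I
+ * A1 = 2 * sum_modes Re(f * conj(a)) * fic
+ * A2 = sum_modes |a|^2 * fic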
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: data type used for accumulation + */ + +#include "common.cuh" + +extern "C" __global__ void make_a012_notb(const complex* f, + const complex* a, + const IN_TYPE* I, + const IN_TYPE* fic, + OUT_TYPE* A0, + OUT_TYPE* A1, + OUT_TYPE* A2, + int z, + int y, + int x, + int maxz) +{ + int ix = threadIdx.x + blockIdx.x * blockDim.x; + int iz = blockIdx.z; + + if (ix >= x) + return; + + if (iz >= maxz) + { + A0[iz * x + ix] = OUT_TYPE(0); // make sure it's the right type (double/float) + A1[iz * x + ix] = OUT_TYPE(0); + A2[iz * x + ix] = OUT_TYPE(0); + return; + } + + // we sum across y directly, as this is the number of modes, + // which is typically small + auto sumtf0 = ACC_TYPE(0); + auto sumtf1 = ACC_TYPE(0); + auto sumtf2 = ACC_TYPE(0); + for (auto iy = 0; iy < y; ++iy) + { + complex fv = f[iz * y * x + iy * x + ix]; + sumtf0 += fv.real() * fv.real() + fv.imag() * fv.imag(); + + complex av = a[iz * y * x + iy * x + ix]; + // 2 * real(f * conj(a)) + sumtf1 += MATH_TYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag()); + + // abs(a)^2 + sumtf2 += av.real() * av.real() + av.imag() * av.imag(); + } + + MATH_TYPE Iv = I[iz * x + ix]; + MATH_TYPE ficv = fic[iz]; + A0[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf0) * ficv - Iv); + A1[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf1) * ficv); + A2[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf2) * ficv); +} \ No newline at end of file From 9ad9985b71c6e1852c8fcf96e8913cbd6ed619a4 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 13 Jan 2025 13:34:31 +0000 Subject: [PATCH 10/12] Add cupy and pycuda kernels --- ptypy/accelerate/cuda_cupy/kernels.py | 24 ++++++++++++++++++++++++ ptypy/accelerate/cuda_pycuda/kernels.py | 23 +++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py index b743e9b08..becaf78b2 100644 --- a/ptypy/accelerate/cuda_cupy/kernels.py +++ b/ptypy/accelerate/cuda_cupy/kernels.py @@ -676,6 +676,7 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type='double', math_typ } self.make_model_cuda = load_kernel('make_model', subs) self.make_a012_cuda = load_kernel('make_a012', subs) + self.make_a012_notb_cuda = load_kernel('make_a012_notb', subs) self.error_reduce_cuda = load_kernel('error_reduce', { **subs, 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', @@ -748,6 +749,29 @@ def make_a012(self, b_f, b_a, b_b, addr, I, fic): args=(b_f, b_a, b_b, I, fic, A0, A1, A2, z, y, x, maxz)) + def make_a012_notb(self, b_f, b_a, addr, I, fic): + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + z = np.int32(sh[0]) + maxz = np.int32(maxz) + y = np.int32(self.nmodes) + x = np.int32(sh[1]*sh[2]) + bx = 1024 + if self.queue is not None: + self.queue.use() + self.make_a012_notb_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(b_f, b_a, I, fic, + A0, A1, A2, z, y, x, maxz)) + def fill_b(self, addr, Brenorm, w, B): # stopper maxz = w.shape[0] diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 8fcebb6ef..541a1e8b1 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -654,6 +654,7 @@ def __init__(self, aux, 
nmodes=1, queue=None, accumulate_type = 'double', math_t } self.make_model_cuda = load_kernel('make_model', subs) self.make_a012_cuda = load_kernel('make_a012', subs) + self.make_a012_notb_cuda = load_kernel('make_a012_notb', subs) self.error_reduce_cuda = load_kernel('error_reduce', { **subs, 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', @@ -723,6 +724,28 @@ def make_a012(self, b_f, b_a, b_b, addr, I, fic): grid=(int((x + bx - 1) // bx), 1, int(z)), stream=self.queue) + def make_a012_notb(self, b_f, b_a, addr, I, fic): + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + z = np.int32(sh[0]) + maxz = np.int32(maxz) + y = np.int32(self.nmodes) + x = np.int32(sh[1]*sh[2]) + bx = 1024 + self.make_a012_notb_cuda(b_f, b_a, I, fic, + A0, A1, A2, z, y, x, maxz, + block=(bx, 1, 1), + grid=(int((x + bx - 1) // bx), 1, int(z)), + stream=self.queue) + def fill_b(self, addr, Brenorm, w, B): # stopper maxz = w.shape[0] From b25b4f6a4c9a97af6b648d01c5df619614a700a3 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 13 Jan 2025 13:46:03 +0000 Subject: [PATCH 11/12] Add cupy and pycuda kernel regression tests --- .../gradient_descent_kernel_test.py | 73 +++++++++++++++++-- .../gradient_descent_kernel_test.py | 73 +++++++++++++++++-- 2 files changed, 136 insertions(+), 10 deletions(-) diff --git a/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py index b3a6e8ff7..55837771b 100644 --- a/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py @@ -23,9 +23,9 @@ class GradientDescentKernelTest(CupyCudaTest): def prepare_arrays(self, performance=False): if not performance: nmodes = 2 - N_buf = 4 - N = 3 - A = 3 + N_buf = 4 + N = 3 + A = 3 else: nmodes = 4 N_buf = 100 @@ -198,6 +198,69 @@ def test_make_a012(self): exp_A2, GDK.gpu.LLden.get(), err_msg="`LLden` buffer (=A2) has not been updated as expected") + def test_make_a012_notb(self): + b_f, b_a, _, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012_notb(b_f, b_a, addr, I, fic) + + exp_A0 = np.array([[[1., 1., 1.], + [2., 2., 2.], + [5., 5., 5.]], + + [[12., 12., 12.], + [13., 13., 13.], + [16., 16., 16.]], + + [[37., 37., 37.], + [38., 38., 38.], + [41., 41., 41.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A0, GDK.gpu.Imodel.get(), + err_msg="`Imodel` buffer (=A0) has not been updated as expected") + + exp_A1 = np.array([[[0., 0., 0.], + [2., 6., 10.], + [4., 12., 20.]], + + [[0., 0., 0.], + [10., 14., 18.], + [20., 28., 36.]], + + [[0., 0., 0.], + [18., 22., 26.], + [36., 44., 52.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A1, GDK.gpu.LLerr.get(), + err_msg="`LLerr` buffer (=A1) has not been updated as expected") + + exp_A2 = np.array([[[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A2, GDK.gpu.LLden.get(), + err_msg="`LLden` buffer (=A2) has not been updated as expected") + 
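+ # NB: exp_A2 above is the pure |a|^2 * fic term; make_a012's
+ # 2*Re(f*conj(b)) contribution has no counterpart in the _notb kernel.
+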
@unittest.skipIf(not perfrun, "performance test") def test_make_a012_performance(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) @@ -233,7 +296,7 @@ def test_fill_b_perf(self): GDK.allocate() GDK.make_a012(b_f, b_a, b_b, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B_dev) - + def test_error_reduce(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() GDK = GradientDescentKernel(b_f, addr.shape[1]) @@ -246,7 +309,7 @@ def test_error_reduce(self): np.testing.assert_array_almost_equal( exp_err, err_sum.get(), err_msg="`err_sum` has not been updated as expected") - + @unittest.skipIf(not perfrun, "performance test") def test_error_reduce_perf(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) diff --git a/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py index 6caed13f2..c08a61e9f 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py @@ -23,9 +23,9 @@ class GradientDescentKernelTest(PyCudaTest): def prepare_arrays(self, performance=False): if not performance: nmodes = 2 - N_buf = 4 - N = 3 - A = 3 + N_buf = 4 + N = 3 + A = 3 else: nmodes = 4 N_buf = 100 @@ -198,6 +198,69 @@ def test_make_a012(self): exp_A2, GDK.gpu.LLden.get(), err_msg="`LLden` buffer (=A2) has not been updated as expected") + def test_make_a012_notb(self): + b_f, b_a, _, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012_notb(b_f, b_a, addr, I, fic) + + exp_A0 = np.array([[[1., 1., 1.], + [2., 2., 2.], + [5., 5., 5.]], + + [[12., 12., 12.], + [13., 13., 13.], + [16., 16., 16.]], + + [[37., 37., 37.], + [38., 38., 38.], + [41., 41., 41.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A0, GDK.gpu.Imodel.get(), + err_msg="`Imodel` buffer (=A0) has not been updated as expected") + + exp_A1 = np.array([[[0., 0., 0.], + [2., 6., 10.], + [4., 12., 20.]], + + [[0., 0., 0.], + [10., 14., 18.], + [20., 28., 36.]], + + [[0., 0., 0.], + [18., 22., 26.], + [36., 44., 52.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A1, GDK.gpu.LLerr.get(), + err_msg="`LLerr` buffer (=A1) has not been updated as expected") + + exp_A2 = np.array([[[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A2, GDK.gpu.LLden.get(), + err_msg="`LLden` buffer (=A2) has not been updated as expected") + @unittest.skipIf(not perfrun, "performance test") def test_make_a012_performance(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) @@ -233,7 +296,7 @@ def test_fill_b_perf(self): GDK.allocate() GDK.make_a012(b_f, b_a, b_b, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B_dev) - + def test_error_reduce(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() GDK = GradientDescentKernel(b_f, addr.shape[1]) @@ -246,7 +309,7 @@ def test_error_reduce(self): np.testing.assert_array_almost_equal( exp_err, err_sum.get(), err_msg="`err_sum` has not been updated as expected") - + 
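+ # NB: exp_A2 above is the pure |a|^2 * fic term; make_a012's
+ # 2*Re(f*conj(b)) contribution has no counterpart in the _notb kernel.
+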
@unittest.skipIf(not perfrun, "performance test") def test_error_reduce_perf(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) From 1a24467f5f6ac12d306dd22325d264d5a64fe057 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 13 Jan 2025 14:19:49 +0000 Subject: [PATCH 12/12] CuPy and PyCUDA MLSeparateGrads --- ptypy/custom/MLSeparateGrads_cupy.py | 881 +++++++++++++++++++++++++ ptypy/custom/MLSeparateGrads_pycuda.py | 858 ++++++++++++++++++++++++ 2 files changed, 1739 insertions(+) create mode 100644 ptypy/custom/MLSeparateGrads_cupy.py create mode 100644 ptypy/custom/MLSeparateGrads_pycuda.py diff --git a/ptypy/custom/MLSeparateGrads_cupy.py b/ptypy/custom/MLSeparateGrads_cupy.py new file mode 100644 index 000000000..14899895c --- /dev/null +++ b/ptypy/custom/MLSeparateGrads_cupy.py @@ -0,0 +1,881 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +import cupy as cp +import cupyx + +from ptypy.engines import register +from .MLSeparateGrads_serial import MLSeparateGrads_serial +from ptypy.accelerate.base.engines.ML_serial import BaseModelSerial +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.accelerate.cuda_cupy import get_context, log_device_memory_stats +from ptypy.accelerate.cuda_cupy.kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ptypy.accelerate.cuda_cupy.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ptypy.accelerate.cuda_cupy.array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, FFTGaussianSmoothingKernel, TransposeKernel +from ptypy.accelerate.cuda_cupy.mem_utils import GpuDataManager + +#from ..mem_utils import GpuDataManager +from ptypy.accelerate.base import address_manglers + +__all__ = ['MLSeparateGrads_cupy'] + +# can be used to limit the number of blocks, simulating that they don't fit +MAX_BLOCKS = 99999 +# MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit + + +@register() +class MLSeparateGrads_cupy(MLSeparateGrads_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [fft_lib] + default = cuda + type = str + help = Choose the cupy-compatible FFT module. + doc = One of: + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'cupy'`` : cupy using cufft (fast load, slowest compute due to additional store/load stages) + choices = 'cuda','cupy' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. + """ + super().__init__(ptycho_parent, pars) + + def engine_initialize(self): + """ + Prepare for ML reconstruction. 
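+ Sets up the compute queue and transfer streams, the gradient-smoothing
+ kernels and the per-storage probe support kernels.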
+ """ + self.queue = get_context(new_queue=True) + + self.qu_htod = cp.cuda.Stream() + self.qu_dtoh = cp.cuda.Stream() + + if self.p.smooth_gradient_method == "convolution": + self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None + + if self.p.smooth_gradient_method == "fft": + self.FGSK = FFTGaussianSmoothingKernel(queue=self.queue) + + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + super().engine_initialize() + # self._setup_kernels() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + AUK = ArrayUtilsKernel(queue=self.queue) + self._dot_kernel = AUK.dot + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = scan.max_frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple([int(s) for s in geo.shape]) + aux = cp.zeros(ash, dtype=np.complex64) + kern.aux = aux + kern.a = cp.zeros(ash, dtype=np.complex64) + kern.b = cp.zeros(ash, dtype=np.complex64) + + # setup kernels, one for each SCAN. + kern.GDK = GradientDescentKernel( + aux, nmodes, queue=self.queue, math_type="double") + kern.GDK.allocate() + + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + kern.TK = TransposeKernel(queue=self.queue) + + kern.PROP = PropagationKernel( + aux, geo.propagator, queue_thread=self.queue, fft_type=self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + kern.PCK = PositionCorrectionKernel( + aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) + kern.PCK.allocate() + + mag_mem = 0 + for scan, kern in self.kernels.items(): + mag_mem = max(kern.aux.nbytes // 2, mag_mem) + ma_mem = mag_mem + blk = ma_mem + mag_mem + + # We need to add the free memory from the pool to the free device memory, + # as both will be used for allocations + mempool = cp.get_default_memory_pool() + mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes() + + # leave 200MB room for safety + fit = int(mem - 200 * 1024 * 1024) // blk + if not fit: + log(1, "Cannot fit memory into device, if possible reduce frames per block. 
Exiting...") + raise SystemExit("ptypy has been exited.") + + # TODO grow blocks dynamically + nma = min(fit, MAX_BLOCKS) + log_device_memory_stats(4) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3))) + log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma)) + # reset memory or create new + self.w_data = GpuDataManager(ma_mem, 0, nma, False) + self.I_data = GpuDataManager(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super().engine_prepare() + ## Serialize new data ## + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # recursive copy to gpu for probe and object + for _cname, c in self.ptycho.containers.items(): + if c.original != self.pr and c.original != self.ob: + continue + for _sname, s in c.S.items(): + # convert data here + s.gpu = cp.asarray(s.data) + s.cpu = cupyx.empty_pinned( + s.data.shape, s.data.dtype, order="C") + s.cpu[:] = s.data + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.err_phot_gpu = cp.asarray(prep.err_phot) + prep.fic_gpu = cp.ones_like(prep.err_phot_gpu) + + if use_tiles: + prep.addr2 = np.ascontiguousarray( + np.transpose(prep.addr, (2, 3, 0, 1))) + + prep.addr_gpu = cp.asarray(prep.addr) + if self.do_position_refinement: + prep.original_addr_gpu = cp.asarray(prep.original_addr) + prep.error_state_gpu = cp.empty_like(prep.err_phot_gpu) + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + # Todo: Which address to pick? + if use_tiles: + prep.addr2_gpu = cp.asarray(prep.addr2) + + prep.I = cupyx.empty_pinned(d.data.shape, d.data.dtype, order="C") + prep.I[:] = d.data + + # Todo: avoid that extra copy of data + if self.do_position_refinement: + ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C") + prep.ma[:] = ma + + log(4, 'Free memory on device: %.2f GB' % + (float(cp.cuda.runtime.memGetInfo()[0])/1e9)) + self.w_data.add_data_block() + self.I_data.add_data_block() + + self.dID_list = list(self.di.S.keys()) + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): + """ + Overloading the context of Storage.data here, to allow for in-place math on Container instances: + """ + if container is not None: + if container.original == self.pr or container.original == self.ob: + for s in container.S.values(): + # convert data here + if dev == 'gpu': + s.data = s.gpu + if sync_copy: + s.gpu.set(s.cpu) + elif dev == 'cpu': + s.data = s.cpu + if sync_copy: + s.gpu.get(out=s.cpu) + #print('%s to cpu' % s.ID) + else: + for container in self.ptycho.containers.values(): + self._set_pr_ob_ref_for_data( + dev=dev, container=container, sync_copy=sync_copy) + + def _get_smooth_gradient(self, data, sigma): + if self.p.smooth_gradient_method == "convolution": + if self.GSK.tmp is None: + self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64) + try: + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + except MemoryError: + raise RuntimeError("Convolution 
kernel too large for direct convolution on GPU", + "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.") + elif self.p.smooth_gradient_method == "fft": + self.FGSK.filter(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") + return data + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.gpu = self._get_smooth_gradient( + s.gpu, self.smooth_gradient.sigma) + + return self._replace_grad(self.ob_grad, new_ob_grad) + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + + return self._replace_grad(self.pr_grad, new_pr_grad) + + def _replace_grad(self, grad, new_grad): + norm = np.double(0.) + dot = np.double(0.) + for name, new in new_grad.storages.items(): + old = grad.storages[name] + norm += self._dot_kernel(new.gpu, new.gpu).get()[0] + dot += self._dot_kernel(new.gpu, old.gpu).get()[0] + old.gpu[:] = new.gpu + return norm, dot + + def engine_iterate(self, num=1): + err = super().engine_iterate(num) + # copy all data back to cpu + self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True) + return err + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > + self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % + self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. 
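+ Candidate shifts are drawn from the position-correction kernel's
+ address mangler; a mangled address is kept whenever it lowers the
+ ML error.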
+ """ + log(4, "----------- START POS REF -------------") + for dID in self.dID_list: + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + err_phot = prep.err_phot_gpu + error_state = prep.error_state_gpu + + # copy intensities and weights to GPU + ev_w, w, data_w = self.w_data.to_gpu( + prep.weights, dID, self.qu_htod) + ev, I, data_I = self.I_data.to_gpu(prep.I, dID, self.qu_htod) + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + PCK.queue.wait_event(ev) + # w & I now on device + PCK.log_likelihood_ml(aux, addr, I, w, err_phot) + cp.cuda.runtime.memcpy(dst=error_state.data.ptr, + src=err_phot.data.ptr, + size=err_phot.nbytes, + kind=3) # d2d + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address( + i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot) + PCK.update_addr_and_error_state( + addr, error_state, mangled_addr, err_phot) + + data_w.record_done(self.queue, 'compute') + data_I.record_done(self.queue, 'compute') + cp.cuda.runtime.memcpy(dst=err_phot.data.ptr, + src=error_state.data.ptr, + size=err_phot.nbytes, + kind=3) # d2d + if use_tiles: + s1 = addr.shape[0] * addr.shape[1] + s2 = addr.shape[2] * addr.shape[3] + TK.transpose(addr.reshape(s1, s2), + prep.addr2_gpu.reshape(s2, s1)) + + self.dID_list.reverse() + + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe. + """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel( + supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel( + support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.w_data = None + self.I_data = None + + for name, s in self.pr.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for name, s in self.ob.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + prep.float_intens_coeff = prep.fic_gpu.get() + + # self.queue.synchronize() + super().engine_finalize() + + log_device_memory_stats(4) + + +class GaussianModel(BaseModelSerial): + """ + Gaussian noise model. 
+ TODO: feed actual statistical weights instead of using the Poisson statistic heuristic.
+ """
+
+ def __init__(self, MLengine):
+ """
+ Core functions for ML computation using a Gaussian model.
+ """
+ super(GaussianModel, self).__init__(MLengine)
+
+ if self.p.reg_del2:
+ self.regularizer = Regul_del2_cupy(
+ self.p.reg_del2_amplitude,
+ queue=self.engine.queue
+ )
+ else:
+ self.regularizer = None
+
+ def prepare(self):
+
+ super(GaussianModel, self).prepare()
+
+ for label, d in self.engine.ptycho.new_data:
+ prep = self.engine.diff_info[d.ID]
+ w = (self.Irenorm * self.engine.ma.S[d.ID].data
+ / (1. / self.Irenorm + d.data)).astype(d.data.dtype)
+ prep.weights = cupyx.empty_pinned(w.shape, w.dtype, order="C")
+ prep.weights[:] = w
+
+ def __del__(self):
+ """
+ Clean up routine
+ """
+ super(GaussianModel, self).__del__()
+
+ def new_grad(self):
+ """
+ Compute a new gradient direction according to a Gaussian noise model.
+
+ Note: The negative log-likelihood and local errors are also computed
+ here.
+ """
+ ob_grad = self.engine.ob_grad_new
+ pr_grad = self.engine.pr_grad_new
+ qu_htod = self.engine.qu_htod
+ queue = self.engine.queue
+
+ self.engine._set_pr_ob_ref_for_data('gpu')
+ ob_grad << 0.
+ pr_grad << 0.
+
+ # We need an array for MPI
+ LL = np.array([0.])
+ error_dct = {}
+
+ for dID in self.engine.dID_list:
+ prep = self.engine.diff_info[dID]
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+ POK = kern.POK
+ aux = kern.aux
+
+ FW = kern.PROP.fw
+ BW = kern.PROP.bw
+
+ # get addresses and auxiliary array
+ addr = prep.addr_gpu
+ fic = prep.fic_gpu
+
+ err_phot = prep.err_phot_gpu
+ # local references
+ ob = self.engine.ob.S[oID].data
+ obg = ob_grad.S[oID].data
+ pr = self.engine.pr.S[pID].data
+ prg = pr_grad.S[pID].data
+
+ # Schedule w & I to device
+ ev_w, w, data_w = self.engine.w_data.to_gpu(
+ prep.weights, dID, qu_htod)
+ ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod)
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
+
+ # forward prop
+ FW(aux, aux)
+ GDK.make_model(aux, addr)
+
+ queue.wait_event(ev)
+
+ if self.p.floating_intensities:
+ GDK.floating_intensity(addr, w, I, fic)
+
+ GDK.main(aux, addr, w, I)
+ data_w.record_done(queue, 'compute')
+ data_I.record_done(queue, 'compute')
+
+ GDK.error_reduce(addr, err_phot)
+
+ BW(aux, aux)
+
+ use_atomics = self.p.object_update_cuda_atomics
+ addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+ POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
+
+ use_atomics = self.p.probe_update_cuda_atomics
+ addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+ POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
+
+ queue.synchronize()
+ self.engine.dID_list.reverse()
+
+ # TODO: we need err_phot.sum here, but not necessarily this error_dct until the end of a contiguous iteration
+ for dID, prep in self.engine.diff_info.items():
+ err_phot = prep.err_phot_gpu.get()
+ LL += err_phot.sum()
+ err_phot /= np.prod(prep.weights.shape[-2:])
+ err_fourier = np.zeros_like(err_phot)
+ err_exit = np.zeros_like(err_phot)
+ errs = np.ascontiguousarray(
+ np.vstack([err_fourier, err_phot, err_exit]).T)
+ error_dct.update(zip(prep.view_IDs, errs))
+
+ # MPI reduction of gradients
+
+ # DtoH copies
+ for s in ob_grad.S.values():
+ s.gpu.get(out=s.cpu)
+ for s in pr_grad.S.values():
+ s.gpu.get(out=s.cpu)
+
self.engine._set_pr_ob_ref_for_data('cpu') + + ob_grad.allreduce() + pr_grad.allreduce() + parallel.allreduce(LL) + + # HtoD cause we continue on gpu + for s in ob_grad.S.values(): + s.gpu.set(s.cpu) + for s in pr_grad.S.values(): + s.gpu.set(s.cpu) + self.engine._set_pr_ob_ref_for_data('gpu') + + # Object regularizer + if self.regularizer: + for name, s in self.engine.ob.storages.items(): + ob_grad.storages[name].data += self.regularizer.grad(s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, c_ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + # does not accept np.longdouble + B = cp.zeros((3,), dtype=np.float32) + Brenorm = 1. / self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + ob_h = c_ob_h.S[oID].data + pr = self.pr.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False) + + # forward prop + FW(f, f) + FW(a, a) + + queue.wait_event(ev) + + GDK.make_a012_notb(f, a, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + c_ob_h.storages[name].data, s.data) + + self.B = B + + return B + + def poly_line_coeffs_pr(self, c_pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + # does not accept np.longdouble + B = cp.zeros((3,), dtype=np.float32) + Brenorm = 1. 
/ self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + pr = self.pr.S[pID].data + pr_h = c_pr_h.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob, pr_h, add=True) + + # forward prop + FW(f, f) + FW(a, a) + + queue.wait_event(ev) + + GDK.make_a012_notb(f, a, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + self.B = B + + return B + + +class Regul_del2_cupy(object): + """\ + Squared gradient regularizer (Gaussian prior). + + This class applies to any numpy array. + """ + + def __init__(self, amplitude, axes=[-2, -1], queue=None): + # Regul.__init__(self, axes) + self.axes = axes + self.amplitude = amplitude + self.delxy = None + self.g = None + self.LL = None + self.queue = queue + self.AUK = ArrayUtilsKernel(queue=queue) + self.DELK_c = DerivativesKernel(np.complex64, queue=queue) + self.DELK_f = DerivativesKernel(np.float32, queue=queue) + + + def empty(x): return cp.empty( + x.shape, x.dtype) + + def delxb(x, axis=-1): + out = empty(x) + if x.dtype == np.float32: + self.DELK_f.delxb(x, out, axis) + elif x.dtype == np.complex64: + self.DELK_c.delxb(x, out, axis) + else: + raise TypeError("Type %s invalid for derivatives" % x.dtype) + return out + + self.delxb = delxb + + def delxf(x, axis=-1): + out = empty(x) + if x.dtype == np.float32: + self.DELK_f.delxf(x, out, axis) + elif x.dtype == np.complex64: + self.DELK_c.delxf(x, out, axis) + else: + raise TypeError("Type %s invalid for derivatives" % x.dtype) + return out + + self.delxf = delxf + self.norm = lambda x: self.AUK.norm2(x).get().item() + self.dot = lambda x, y: self.AUK.dot(x, y).get().item() + + self._grad_reg_kernel = cp.ElementwiseKernel( + "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx", + "complex64 out", + "out = (px+py-my-mx) * fac", + "grad_reg", + no_return=True + ) + + def grad(amp, px, py, mx, my): + out = empty(px) + if self.queue is not None: + self.queue.use() + self._grad_reg_kernel(amp, py, px, mx, my, out) + return out + self.reg_grad = grad + + def grad(self, x): + """ + Compute and return the regularizer gradient given the array x. + """ + ax0, ax1 = self.axes + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + self.delxy = [del_xf, del_yf, del_xb, del_yb] + + # TODO this one might be slow, maybe try with elementwise kernel + #self.g = (del_xb + del_yb - del_xf - del_yf) * 2. * self.amplitude + self.g = self.reg_grad(2. 
* self.amplitude, + del_xb, del_yb, del_xf, del_yf) + + self.LL = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + return self.g + + def poly_line_coeffs(self, h, x=None): + ax0, ax1 = self.axes + if x is None: + del_xf, del_yf, del_xb, del_yb = self.delxy + else: + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + hdel_xf = self.delxf(h, axis=ax0) + hdel_yf = self.delxf(h, axis=ax1) + hdel_xb = self.delxb(h, axis=ax0) + hdel_yb = self.delxb(h, axis=ax1) + + c0 = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + c1 = 2 * self.amplitude * (self.dot(del_xf, hdel_xf) + + self.dot(del_yf, hdel_yf) + + self.dot(del_xb, hdel_xb) + + self.dot(del_yb, hdel_yb)) + + c2 = self.amplitude * (self.norm(hdel_xf) + + self.norm(hdel_yf) + + self.norm(hdel_xb) + + self.norm(hdel_yb)) + + self.coeff = np.array([c0, c1, c2]) + return self.coeff diff --git a/ptypy/custom/MLSeparateGrads_pycuda.py b/ptypy/custom/MLSeparateGrads_pycuda.py new file mode 100644 index 000000000..501c95864 --- /dev/null +++ b/ptypy/custom/MLSeparateGrads_pycuda.py @@ -0,0 +1,858 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy.engines import register +from .MLSeparateGrads_serial import MLSeparateGrads_serial +from ptypy.accelerate.base.engines.ML_serial import BaseModelSerial +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.accelerate.cuda_pycuda import get_context, get_dev_pool +from ptypy.accelerate.cuda_pycuda.kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ptypy.accelerate.cuda_pycuda.array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, FFTGaussianSmoothingKernel, TransposeKernel + +from ptypy.accelerate.cuda_pycuda.mem_utils import GpuDataManager +from ptypy.accelerate.base import address_manglers + +__all__ = ['MLSeparateGrads_pycuda'] + +MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit +#MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit + +@register() +class MLSeparateGrads_pycuda(MLSeparateGrads_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [use_cuda_device_memory_pool] + default = True + type = bool + help = For GPU, use a device memory pool + + [fft_lib] + default = reikna + type = str + help = Choose the pycuda-compatible FFT module. 
+ doc = One of:
+ - ``'reikna'`` : the reikna package (fast load, competitive compute for streaming)
+ - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+ - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages)
+ choices = 'reikna','cuda','skcuda'
+ userlevel = 2
+
+ """
+
+ def __init__(self, ptycho_parent, pars=None):
+ """
+ Maximum likelihood reconstruction engine.
+ """
+ super().__init__(ptycho_parent, pars)
+
+ def engine_initialize(self):
+ """
+ Prepare for ML reconstruction.
+ """
+ self.context, self.queue = get_context(new_queue=True)
+
+ if self.p.use_cuda_device_memory_pool:
+ self._dev_pool = get_dev_pool()
+ self.allocate = self._dev_pool.allocate
+ else:
+ self._dev_pool = None
+ self.allocate = cuda.mem_alloc
+
+ self.qu_htod = cuda.Stream()
+ self.qu_dtoh = cuda.Stream()
+
+ if self.p.smooth_gradient_method == "convolution":
+ self.GSK = GaussianSmoothingKernel(queue=self.queue)
+ self.GSK.tmp = None
+
+ if self.p.smooth_gradient_method == "fft":
+ self.FGSK = FFTGaussianSmoothingKernel(queue=self.queue)
+
+ # Real/Fourier Support Kernel
+ self.RSK = {}
+ self.FSK = {}
+
+ super().engine_initialize()
+ #self._setup_kernels()
+
+ def _setup_kernels(self):
+ """
+ Setup kernels, one for each scan. Derive scans from ptycho class
+ """
+ AUK = ArrayUtilsKernel(queue=self.queue)
+ self._dot_kernel = AUK.dot
+ # get the scans
+ for label, scan in self.ptycho.model.scans.items():
+
+ kern = u.Param()
+ kern.scanmodel = type(scan).__name__
+ self.kernels[label] = kern
+
+ # TODO: needs to be adapted for broad bandwidth
+ geo = scan.geometries[0]
+
+ # Get info to shape buffer arrays
+ fpc = scan.max_frames_per_block
+
+ # TODO : make this more foolproof
+ try:
+ nmodes = scan.p.coherence.num_probe_modes * \
+ scan.p.coherence.num_object_modes
+ except:
+ nmodes = 1
+
+ # create buffer arrays
+ ash = (fpc * nmodes,) + tuple([int(s) for s in geo.shape])
+ aux = gpuarray.zeros(ash, dtype=np.complex64)
+ kern.aux = aux
+ kern.a = gpuarray.zeros(ash, dtype=np.complex64)
+ kern.b = gpuarray.zeros(ash, dtype=np.complex64)
+
+ # setup kernels, one for each SCAN.
+ kern.GDK = GradientDescentKernel(aux, nmodes, queue=self.queue, math_type="double")
+ kern.GDK.allocate()
+
+ kern.POK = PoUpdateKernel(queue_thread=self.queue)
+ kern.POK.allocate()
+
+ kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
+ kern.AWK.allocate()
+
+ kern.TK = TransposeKernel(queue=self.queue)
+
+ kern.PROP = PropagationKernel(aux, geo.propagator, queue_thread=self.queue, fft=self.p.fft_lib)
+ kern.PROP.allocate()
+ kern.resolution = geo.resolution[0]
+
+ if self.do_position_refinement:
+ kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+ kern.PCK.allocate()
+
+ mag_mem = 0
+ for scan, kern in self.kernels.items():
+ mag_mem = max(kern.aux.nbytes // 2, mag_mem)
+ ma_mem = mag_mem
+ mem = cuda.mem_get_info()[0]
+ blk = ma_mem + mag_mem
+ fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety
+ if not fit:
+ log(1,"Cannot fit memory into device, if possible reduce frames per block.
Exiting...") + raise SystemExit("ptypy has been exited.") + + # TODO grow blocks dynamically + nma = min(fit, MAX_BLOCKS) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block {:.2f} GB'.format(float(blk)/(1024**3))) + log(4, 'PyCUDA max blocks fitting on GPU: ma_arrays={}'.format(nma)) + # reset memory or create new + self.w_data = GpuDataManager(ma_mem, 0, nma, False) + self.I_data = GpuDataManager(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super().engine_prepare() + ## Serialize new data ## + use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics) + + # recursive copy to gpu for probe and object + for _cname, c in self.ptycho.containers.items(): + if c.original != self.pr and c.original != self.ob: + continue + for _sname, s in c.S.items(): + # convert data here + s.gpu = gpuarray.to_gpu(s.data) + s.cpu = cuda.pagelocked_empty(s.data.shape, s.data.dtype, order="C") + s.cpu[:] = s.data + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.fic_gpu = gpuarray.ones_like(prep.err_phot_gpu) + + if use_tiles: + prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) + + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + if self.do_position_refinement: + prep.original_addr_gpu = gpuarray.to_gpu(prep.original_addr) + prep.error_state_gpu = gpuarray.empty_like(prep.err_phot_gpu) + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + # Todo: Which address to pick? + if use_tiles: + prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + + prep.I = cuda.pagelocked_empty(d.data.shape, d.data.dtype, order="C", mem_flags=4) + prep.I[:] = d.data + + # Todo: avoid that extra copy of data + if self.do_position_refinement: + ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4) + prep.ma[:] = ma + + log(4, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) + self.w_data.add_data_block() + self.I_data.add_data_block() + + self.dID_list = list(self.di.S.keys()) + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): + """ + Overloading the context of Storage.data here, to allow for in-place math on Container instances: + """ + if container is not None: + if container.original==self.pr or container.original==self.ob: + for s in container.S.values(): + # convert data here + if dev == 'gpu': + s.data = s.gpu + if sync_copy: s.gpu.set(s.cpu) + elif dev == 'cpu': + s.data = s.cpu + if sync_copy: + s.gpu.get(s.cpu) + #print('%s to cpu' % s.ID) + else: + for container in self.ptycho.containers.values(): + self._set_pr_ob_ref_for_data(dev=dev, container=container, sync_copy=sync_copy) + + def _get_smooth_gradient(self, data, sigma): + if self.p.smooth_gradient_method == "convolution": + if self.GSK.tmp is None: + self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64) + try: + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + except MemoryError: + raise 
RuntimeError("Convolution kernel too large for direct convolution on GPU", + "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.") + elif self.p.smooth_gradient_method == "fft": + self.FGSK.filter(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") + return data + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.gpu = self._get_smooth_gradient(s.gpu, self.smooth_gradient.sigma) + + return self._replace_grad(self.ob_grad, new_ob_grad) + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + + return self._replace_grad(self.pr_grad , new_pr_grad) + + def _replace_grad(self, grad, new_grad): + norm = np.double(0.) + dot = np.double(0.) + for name, new in new_grad.storages.items(): + old = grad.storages[name] + norm += self._dot_kernel(new.gpu,new.gpu).get()[0] + dot += self._dot_kernel(new.gpu,old.gpu).get()[0] + old.gpu[:] = new.gpu + return norm, dot + + def engine_iterate(self, num=1): + err = super().engine_iterate(num) + # copy all data back to cpu + self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True) + return err + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. 
+ """ + log(4, "----------- START POS REF -------------") + for dID in self.dID_list: + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + err_phot = prep.err_phot_gpu + error_state = prep.error_state_gpu + + # copy intensities and weights to GPU + ev_w, w, data_w = self.w_data.to_gpu(prep.weights, dID, self.qu_htod) + ev, I, data_I = self.I_data.to_gpu(prep.I, dID, self.qu_htod) + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + PCK.queue.wait_for_event(ev) + # w & I now on device + PCK.log_likelihood_ml(aux, addr, I, w, err_phot) + cuda.memcpy_dtod(dest=error_state.ptr, + src=err_phot.ptr, + size=err_phot.nbytes) + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_phot) + + data_w.record_done(self.queue, 'compute') + data_I.record_done(self.queue, 'compute') + cuda.memcpy_dtod(dest=err_phot.ptr, + src=error_state.ptr, + size=err_phot.nbytes) + if use_tiles: + s1 = addr.shape[0] * addr.shape[1] + s2 = addr.shape[2] * addr.shape[3] + TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + + self.dID_list.reverse() + + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe. + """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.w_data = None + self.I_data = None + + for name, s in self.pr.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for name, s in self.ob.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + prep.float_intens_coeff = prep.fic_gpu.get() + + + #self.queue.synchronize() + super().engine_finalize() + +class GaussianModel(BaseModelSerial): + """ + Gaussian noise model. + TODO: feed actual statistical weights instead of using the Poisson statistic heuristic. 
+ """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Gaussian model. + """ + super(GaussianModel, self).__init__(MLengine) + + if self.p.reg_del2: + self.regularizer = Regul_del2_pycuda( + self.p.reg_del2_amplitude, + queue=self.engine.queue, + allocator=self.engine.allocate + ) + else: + self.regularizer = None + + def prepare(self): + + super(GaussianModel, self).prepare() + + for label, d in self.engine.ptycho.new_data: + prep = self.engine.diff_info[d.ID] + w = (self.Irenorm * self.engine.ma.S[d.ID].data + / (1. / self.Irenorm + d.data)).astype(d.data.dtype) + prep.weights = cuda.pagelocked_empty(w.shape, w.dtype, order="C", mem_flags=4) + prep.weights[:] = w + + def __del__(self): + """ + Clean up routine + """ + super(GaussianModel, self).__del__() + + def new_grad(self): + """ + Compute a new gradient direction according to a Gaussian noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + ob_grad = self.engine.ob_grad_new + pr_grad = self.engine.pr_grad_new + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + self.engine._set_pr_ob_ref_for_data('gpu') + ob_grad << 0. + pr_grad << 0. + + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + POK = kern.POK + aux = kern.aux + + FW = kern.PROP.fw + BW = kern.PROP.bw + + # get addresses and auxilliary array + addr = prep.addr_gpu + fic = prep.fic_gpu + + err_phot = prep.err_phot_gpu + # local references + ob = self.engine.ob.S[oID].data + obg = ob_grad.S[oID].data + pr = self.engine.pr.S[pID].data + prg = pr_grad.S[pID].data + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + + # forward prop + FW(aux, aux) + GDK.make_model(aux, addr) + + queue.wait_for_event(ev) + + if self.p.floating_intensities: + GDK.floating_intensity(addr, w, I, fic) + + GDK.main(aux, addr, w, I) + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + GDK.error_reduce(addr, err_phot) + + BW(aux, aux) + + use_atomics = self.p.object_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics) + + use_atomics = self.p.probe_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics) + + queue.synchronize() + self.engine.dID_list.reverse() + + # TODO we err_phot.sum, but not necessarily this error_dct until the end of contiguous iteration + for dID, prep in self.engine.diff_info.items(): + err_phot = prep.err_phot_gpu.get() + LL += err_phot.sum() + err_phot /= np.prod(prep.weights.shape[-2:]) + err_fourier = np.zeros_like(err_phot) + err_exit = np.zeros_like(err_phot) + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + error_dct.update(zip(prep.view_IDs, errs)) + + # MPI reduction of gradients + + # DtoH copies + for s in ob_grad.S.values(): + s.gpu.get(s.cpu) + for s in pr_grad.S.values(): + s.gpu.get(s.cpu) + self.engine._set_pr_ob_ref_for_data('cpu') + + ob_grad.allreduce() + 
pr_grad.allreduce() + parallel.allreduce(LL) + + # HtoD cause we continue on gpu + for s in ob_grad.S.values(): + s.gpu.set(s.cpu) + for s in pr_grad.S.values(): + s.gpu.set(s.cpu) + self.engine._set_pr_ob_ref_for_data('gpu') + + # Object regularizer + if self.regularizer: + for name, s in self.engine.ob.storages.items(): + ob_grad.storages[name].data += self.regularizer.grad(s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, c_ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + B = gpuarray.zeros((3,), dtype=np.float32) # does not accept np.longdouble + Brenorm = 1. / self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + ob_h = c_ob_h.S[oID].data + pr = self.pr.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False) + + # forward prop + FW(f,f) + FW(a,a) + + queue.wait_for_event(ev) + + GDK.make_a012_notb(f, a, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + c_ob_h.storages[name].data, s.data) + + self.B = B + + return B + + def poly_line_coeffs_pr(self, c_pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + B = gpuarray.zeros((3,), dtype=np.float32) # does not accept np.longdouble + Brenorm = 1. 
/ self.LL[0] ** 2
+
+        # Outer loop: through diffraction patterns
+        for dID in self.engine.dID_list:
+            prep = self.engine.diff_info[dID]
+
+            # find probe, object and exit IDs for this dID
+            pID, oID, eID = prep.poe_IDs
+
+            # references for kernels
+            kern = self.engine.kernels[prep.label]
+            GDK = kern.GDK
+            AWK = kern.AWK
+
+            f = kern.aux
+            a = kern.a
+
+            FW = kern.PROP.fw
+
+            # get addresses and auxiliary arrays
+            addr = prep.addr_gpu
+            fic = prep.fic_gpu
+
+            # Schedule w & I to device
+            ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod)
+            ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod)
+
+            # local references
+            ob = self.ob.S[oID].data
+            pr = self.pr.S[pID].data
+            pr_h = c_pr_h.S[pID].data
+
+            # make propagated exit (to buffer); `a` must be overwritten,
+            # not accumulated, since only the probe direction varies here
+            AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
+            AWK.build_aux_no_ex(a, addr, ob, pr_h, add=False)
+
+            # forward prop
+            FW(f, f)
+            FW(a, a)
+
+            queue.wait_for_event(ev)
+
+            GDK.make_a012_notb(f, a, addr, I, fic)
+            GDK.fill_b(addr, Brenorm, w, B)
+
+            data_w.record_done(queue, 'compute')
+            data_I.record_done(queue, 'compute')
+
+        queue.synchronize()
+        self.engine.dID_list.reverse()
+
+        B = B.get()
+        parallel.allreduce(B)
+
+        self.B = B
+
+        return B
+
+
+class Regul_del2_pycuda(object):
+    """\
+    Squared gradient regularizer (Gaussian prior).
+
+    This class applies to any numpy array.
+    """
+    def __init__(self, amplitude, axes=[-2, -1], queue=None, allocator=None):
+        self.axes = axes
+        self.amplitude = amplitude
+        self.delxy = None
+        self.g = None
+        self.LL = None
+        self.queue = queue
+        self.AUK = ArrayUtilsKernel(queue=queue)
+        self.DELK_c = DerivativesKernel(np.complex64, queue=queue)
+        self.DELK_f = DerivativesKernel(np.float32, queue=queue)
+
+        if allocator is None:
+            self._dev_pool = get_dev_pool()
+            self.allocator = self._dev_pool.allocate
+        else:
+            self.allocator = allocator
+            self._dev_pool = None
+
+        empty = lambda x: gpuarray.empty(x.shape, x.dtype, allocator=self.allocator)
+
+        def delxb(x, axis=-1):
+            out = empty(x)
+            if x.dtype == np.float32:
+                self.DELK_f.delxb(x, out, axis)
+            elif x.dtype == np.complex64:
+                self.DELK_c.delxb(x, out, axis)
+            else:
+                raise TypeError("Type %s invalid for derivatives" % x.dtype)
+            return out
+
+        self.delxb = delxb
+
+        def delxf(x, axis=-1):
+            out = empty(x)
+            if x.dtype == np.float32:
+                self.DELK_f.delxf(x, out, axis)
+            elif x.dtype == np.complex64:
+                self.DELK_c.delxf(x, out, axis)
+            else:
+                raise TypeError("Type %s invalid for derivatives" % x.dtype)
+            return out
+
+        self.delxf = delxf
+        self.norm = lambda x: self.AUK.norm2(x).get().item()
+        self.dot = lambda x, y: self.AUK.dot(x, y).get().item()
+
+        from pycuda.elementwise import ElementwiseKernel
+        self._grad_reg_kernel = ElementwiseKernel(
+            "pycuda::complex<float> *g, float fac, \
+             pycuda::complex<float> *py, pycuda::complex<float> *px, \
+             pycuda::complex<float> *my, pycuda::complex<float> *mx",
+            "g[i] = (px[i]+py[i]-my[i]-mx[i]) * fac",
+            "grad_reg",
+        )
+
+        def grad(amp, px, py, mx, my):
+            out = empty(px)
+            self._grad_reg_kernel(out, amp, py, px, mx, my, stream=self.queue)
+            return out
+        self.reg_grad = grad
+
+    def grad(self, x):
+        """
+        Compute and return the regularizer gradient given the array x.
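+
+        For the prior LL(x) = amplitude * ( |del_xf x|^2 + |del_yf x|^2
+        + |del_xb x|^2 + |del_yb x|^2 ), built from forward (f) and
+        backward (b) finite differences along both axes, the gradient with
+        respect to x reduces (up to boundary terms) to
+
+            g = 2 * amplitude * (del_xb x + del_yb x - del_xf x - del_yf x),
+
+        i.e. a scaled discrete Laplacian, which the fused elementwise
+        kernel evaluates in a single pass over the array.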
+ """ + ax0, ax1 = self.axes + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + self.delxy = [del_xf, del_yf, del_xb, del_yb] + + # TODO this one might be slow, maybe try with elementwise kernel + #self.g = (del_xb + del_yb - del_xf - del_yf) * 2. * self.amplitude + self.g = self.reg_grad(2. * self.amplitude, del_xb, del_yb, del_xf, del_yf) + + + self.LL = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + return self.g + + def poly_line_coeffs(self, h, x=None): + ax0, ax1 = self.axes + if x is None: + del_xf, del_yf, del_xb, del_yb = self.delxy + else: + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + hdel_xf = self.delxf(h, axis=ax0) + hdel_yf = self.delxf(h, axis=ax1) + hdel_xb = self.delxb(h, axis=ax0) + hdel_yb = self.delxb(h, axis=ax1) + + c0 = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + c1 = 2 * self.amplitude * (self.dot(del_xf, hdel_xf) + + self.dot(del_yf, hdel_yf) + + self.dot(del_xb, hdel_xb) + + self.dot(del_yb, hdel_yb)) + + c2 = self.amplitude * (self.norm(hdel_xf) + + self.norm(hdel_yf) + + self.norm(hdel_xb) + + self.norm(hdel_yb)) + + self.coeff = np.array([c0, c1, c2]) + return self.coeff