From af7558973f73ae8dc7b1211ea8b903ba3e81abe7 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 14 Nov 2024 09:27:21 +0000 Subject: [PATCH 01/12] ML with separate gradient steps --- ptypy/custom/ML_separate_grads.py | 782 ++++++++++++++++++++++++++++++ 1 file changed, 782 insertions(+) create mode 100644 ptypy/custom/ML_separate_grads.py diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/ML_separate_grads.py new file mode 100644 index 000000000..49b397f9e --- /dev/null +++ b/ptypy/custom/ML_separate_grads.py @@ -0,0 +1,782 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood separate gradients reconstruction engine. + +TODO. + + * Implement other ML models (Poisson/Euclid) + +This file is part of the PTYPY package. + + :copyright: Copyright 2024 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +import time + +from .. import utils as u +from ..utils.verbose import logger +from ..utils import parallel +from ..engines.utils import Cnorm2, Cdot +from ..engines import register +from ..engines.base import BaseEngine, PositionCorrectionEngine +from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull + + +__all__ = ['MLSeparateGrads'] + + +@register() +class MLSeparateGrads(PositionCorrectionEngine): + """ + Maximum likelihood reconstruction engine. + + + Defaults: + + [name] + default = ML + type = str + help = + doc = + + [ML_type] + default = 'gaussian' + type = str + help = Likelihood model + choices = ['gaussian','poisson','euclid'] + doc = One of ‘gaussian’, poisson’ or ‘euclid’. + + [floating_intensities] + default = False + type = bool + help = Adaptive diffraction pattern rescaling + doc = If True, allow for adaptative rescaling of the diffraction pattern intensities (to correct for incident beam intensity fluctuations). + + [intensity_renormalization] + default = 1. + type = float + lowlim = 0.0 + help = Rescales the intensities so they can be interpreted as Poisson counts. + + [reg_del2] + default = False + type = bool + help = Whether to use a Gaussian prior (smoothing) regularizer + + [reg_del2_amplitude] + default = .01 + type = float + lowlim = 0.0 + help = Amplitude of the Gaussian prior if used + + [smooth_gradient] + default = 0.0 + type = float + help = Smoothing preconditioner + doc = Sigma for gaussian filter (turned off if 0.) + + [smooth_gradient_decay] + default = 0. + type = float + help = Decay rate for smoothing preconditioner + doc = Sigma for gaussian filter will reduce exponentially at this rate + + [scale_precond] + default = False + type = bool + help = Whether to use the object/probe scaling preconditioner + doc = This parameter can give faster convergence for weakly scattering samples. + + [probe_update_start] + default = 0 + type = int + lowlim = 0 + help = Number of iterations before probe update starts + # NOTE: probe_update_start doesn't work with this code, need to add some code to fix this + + [poly_line_coeffs] + default = quadratic + type = str + help = How many coefficients to be used in the the linesearch + choices = ['quadratic','all'] + doc = choose between the 'quadratic' approximation (default) or 'all' + + """ + + SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull] + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. 
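+
+        Unlike the joint ML engine, the object and the probe are treated as
+        two independent conjugate-gradient sequences: each gets its own
+        Polak-Ribiere beta and its own line-search step (tmin_ob / tmin_pr).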
+ """ + super(MLSeparateGrads, self).__init__(ptycho_parent, pars) + + # Instance attributes + + # Object gradient + self.ob_grad = None + + # Object minimization direction + self.ob_h = None + + # Probe gradient + self.pr_grad = None + + # Probe minimization direction + self.pr_h = None + + # Working variables + # Object gradient + self.ob_grad_new = None + + # Probe gradient + self.pr_grad_new = None + + + # Other + self.tmin_ob = None + self.tmin_pr = None + self.ML_model = None + self.smooth_gradient = None + + self.ptycho.citations.add_article( + title='Maximum-likelihood refinement for coherent diffractive imaging', + author='Thibault P. and Guizar-Sicairos M.', + journal='New Journal of Physics', + volume=14, + year=2012, + page=63004, + doi='10.1088/1367-2630/14/6/063004', + comment='The maximum likelihood reconstruction algorithm', + ) + + def engine_initialize(self): + """ + Prepare for ML reconstruction. + """ + super(MLSeparateGrads, self).engine_initialize() + + # Object gradient and minimization direction + self.ob_grad = self.ob.copy(self.ob.ID + '_grad', fill=0.) + self.ob_grad_new = self.ob.copy(self.ob.ID + '_grad_new', fill=0.) + self.ob_h = self.ob.copy(self.ob.ID + '_h', fill=0.) + + # Probe gradient and minimization direction + self.pr_grad = self.pr.copy(self.pr.ID + '_grad', fill=0.) + self.pr_grad_new = self.pr.copy(self.pr.ID + '_grad_new', fill=0.) + self.pr_h = self.pr.copy(self.pr.ID + '_h', fill=0.) + + self.tmin_ob = 1. + self.tmin_pr = 1. + + # Other options + self.smooth_gradient = prepare_smoothing_preconditioner( + self.p.smooth_gradient) + + self._initialize_model() + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + # elif self.p.ML_type.lower() == "poisson": + # self.ML_model = PoissonModel(self) + # elif self.p.ML_type.lower() == "euclid": + # self.ML_model = EuclidModel(self) + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + + + def engine_prepare(self): + """ + Last minute initialization, everything, that needs to be recalculated, + when new data arrives. + """ + # - # fill object with coverage of views + # - for name,s in self.ob_viewcover.S.items(): + # - s.fill(s.get_view_coverage()) + self.ML_model.prepare() + + def engine_iterate(self, num=1): + """ + Compute `num` iterations. + """ + tg = 0. + tc = 0. + ta = time.time() + for it in range(num): + + ######################## + # Compute new gradients + # ob: new_ob_grad + # pr: new_pr_grad + ######################## + t1 = time.time() + error_dct = self.ML_model.new_grad() + new_ob_grad, new_pr_grad = self.ob_grad_new, self.pr_grad_new + tg += time.time() - t1 + + if self.p.probe_update_start <= self.curiter: + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + #support = self.probe_support.get(name) + #if support is not None: + # s.data *= support + # FIXME: this hack doesn't work here as we step in probe and object separately + # FIXME: really it's the probe step that should be zeroed out not the gradient + else: + new_pr_grad.fill(0.) + + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. 
- self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.data[:] = self.smooth_gradient(s.data) + + ############################ + # Compute Polak-Ribiere betas + # bt_ob = bt_num_ob/bt_denom_ob + # bt_pr = bt_num_pr/bt_denom_pr + ############################ + if self.curiter == 0: + bt_ob = 0. + bt_pr = 0. + else: + # For the object + bt_num_ob = Cnorm2(new_ob_grad) - np.real(Cdot(new_ob_grad, self.ob_grad)) + bt_denom_ob = Cnorm2(self.ob_grad) + bt_ob = max(0, bt_num_ob/bt_denom_ob) + + # For the probe + bt_num_pr = Cnorm2(new_pr_grad) - np.real(Cdot(new_pr_grad, self.pr_grad)) + bt_denom_pr = Cnorm2(self.pr_grad) + bt_pr = max(0, bt_num_pr/bt_denom_pr) + + self.ob_grad << new_ob_grad + self.pr_grad << new_pr_grad + + ############################ + # Compute next conjugates + # ob_h -= bt_ob * ob_grad + # pr_h -= bt_pr * pr_grad + # NB: in the below need to do h/tmin + # as did h*tmin when taking steps + # (don't you just love containers?) + ############################ + self.ob_h *= bt_ob / self.tmin_ob + + # Smoothing preconditioner + if self.smooth_gradient: + for name, s in self.ob_h.storages.items(): + s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data) + else: + self.ob_h -= self.ob_grad + + self.pr_h *= bt_pr / self.tmin_pr + self.pr_h -= self.pr_grad + + ######################## + # Compute step-sizes + # ob: tmin_ob + # pr: tmin_pr + ######################## + dt = self.ptycho.FType + t2 = time.time() + + if self.p.poly_line_coeffs == "quadratic": + B_ob = self.ML_model.poly_line_coeffs_ob(self.ob_h) + B_pr = self.ML_model.poly_line_coeffs_pr(self.pr_h) + + # same as above but quicker when poly quadratic + self.tmin_ob = dt(-0.5 * B_ob[1] / B_ob[2]) + self.tmin_pr = dt(-0.5 * B_pr[1] / B_pr[2]) + + else: + raise NotImplementedError("poly_line_coeffs should be 'quadratic' or 'all'") + + tc += time.time() - t2 + + ######################## + # Take conjugate steps + # ob += tmin_ob * ob_h + # pr += tmin_pr * pr_h + ######################## + self.ob_h *= self.tmin_ob + self.pr_h *= self.tmin_pr + self.ob += self.ob_h + self.pr += self.pr_h + + # Position correction + self.position_update() + + # Allow for customized modifications at the end of each iteration + self._post_iterate_update() + + # increase iteration counter + self.curiter +=1 + + logger.info('Time spent in gradient calculation: %.2f' % tg) + logger.info(' .... in coefficient calculation: %.2f' % tc) + return error_dct # np.array([[self.ML_model.LL[0]] * 3]) + + def _post_iterate_update(self): + """ + Enables modification at the end of each ML iteration. + """ + pass + + def engine_finalize(self): + """ + Delete temporary containers. + """ + del self.ptycho.containers[self.ob_grad.ID] + del self.ob_grad + del self.ptycho.containers[self.ob_grad_new.ID] + del self.ob_grad_new + del self.ptycho.containers[self.ob_h.ID] + del self.ob_h + del self.ptycho.containers[self.pr_grad.ID] + del self.pr_grad + del self.ptycho.containers[self.pr_grad_new.ID] + del self.pr_grad_new + del self.ptycho.containers[self.pr_h.ID] + del self.pr_h + + # Save floating intensities into runtime + self.ptycho.runtime["float_intens"] = parallel.gather_dict(self.ML_model.float_intens_coeff) + + # Delete model + del self.ML_model + +class BaseModel(object): + """ + Base class for log-likelihood models. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Gaussian model. 
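+
+        (Base class: holds the setup shared by all noise models. The actual
+        likelihood, gradient and line-search coefficients are supplied by
+        subclasses through new_grad and the poly_line_coeffs_* methods.)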
+ """ + self.engine = MLengine + + # Transfer commonly used attributes from ML engine + self.di = self.engine.di + self.p = self.engine.p + self.ob = self.engine.ob + self.ob_grad = self.engine.ob_grad_new + self.pr_grad = self.engine.pr_grad_new + self.pr = self.engine.pr + self.float_intens_coeff = {} + + if self.p.intensity_renormalization is None: + self.Irenorm = 1. + else: + self.Irenorm = self.p.intensity_renormalization + + if self.p.reg_del2: + self.regularizer = Regul_del2(self.p.reg_del2_amplitude) + else: + self.regularizer = None + + # Create working variables + self.LL = 0. + + + def prepare(self): + # Useful quantities + self.tot_measpts = sum(s.data.size + for s in self.di.storages.values()) + self.tot_power = self.Irenorm * sum(s.tot_power + for s in self.di.storages.values()) + # Prepare regularizer + if self.regularizer is not None: + obj_Npix = self.ob.size + expected_obj_var = obj_Npix / self.tot_power # Poisson + reg_rescale = self.tot_measpts / (8. * obj_Npix * expected_obj_var) + logger.debug( + 'Rescaling regularization amplitude using ' + 'the Poisson distribution assumption.') + logger.debug('Factor: %8.5g' % reg_rescale) + + # TODO remove usage of .p. access + self.regularizer.amplitude = self.p.reg_del2_amplitude * reg_rescale + + def __del__(self): + """ + Clean up routine + """ + # Remove working attributes + for name, diff_view in self.di.views.items(): + if not diff_view.active: + continue + try: + del diff_view.error + except: + pass + + def new_grad(self): + """ + Compute new object and probe gradient directions according to the noise model. + + Note: The negative log-likelihood and local errors should also be computed here. + """ + raise NotImplementedError + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + raise NotImplementedError + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + raise NotImplementedError + + +class GaussianModel(BaseModel): + """ + Gaussian noise model. + TODO: feed actual statistical weights instead of using the Poisson statistic heuristic. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Gaussian model. + """ + BaseModel.__init__(self, MLengine) + + # Gaussian model requires weights + # TODO: update this part of the code once actual weights are passed in the PODs + self.weights = self.engine.di.copy(self.engine.di.ID + '_weights') + # FIXME: This part needs to be updated once statistical weights are properly + # supported in the data preparation. + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + self.weights[di_view] = (self.Irenorm * di_view.pod.ma_view.data + / (1./self.Irenorm + di_view.data)) + + def __del__(self): + """ + Clean up routine + """ + BaseModel.__del__(self) + del self.engine.ptycho.containers[self.weights.ID] + del self.weights + + def new_grad(self): + """ + Compute a new gradient direction according to a Gaussian noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + self.ob_grad.fill(0.) + self.pr_grad.fill(0.) 
+ + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + Imodel = np.zeros_like(I) + f = {} + + # First pod loop: compute total intensity + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f[name] = pod.fw(pod.probe * pod.object) + Imodel += pod.downsample(u.abs2(f[name])) + + # Floating intensity option + if self.p.floating_intensities: + self.float_intens_coeff[dname] = ((w * Imodel * I).sum() + / (w * Imodel**2).sum()) + Imodel *= self.float_intens_coeff[dname] + + DI = np.double(Imodel) - I + + # Second pod loop: gradients computation + LLL = np.sum((w * DI**2).astype(np.float64)) + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + xi = pod.bw(pod.upsample(w*DI) * f[name]) + self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj() + self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj() + + diff_view.error = LLL + error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0]) + LL += LLL + + # MPI reduction of gradients + self.ob_grad.allreduce() + self.pr_grad.allreduce() + parallel.allreduce(LL) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + self.ob_grad.storages[name].data += self.regularizer.grad( + s.data) + LL += self.regularizer.LL + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. 
/ self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + return B + + +class Regul_del2(object): + """\ + Squared gradient regularizer (Gaussian prior). + + This class applies to any numpy array. + """ + def __init__(self, amplitude, axes=[-2, -1]): + # Regul.__init__(self, axes) + self.axes = axes + self.amplitude = amplitude + self.delxy = None + self.g = None + self.LL = None + + def grad(self, x): + """ + Compute and return the regularizer gradient given the array x. + """ + ax0, ax1 = self.axes + del_xf = u.delxf(x, axis=ax0) + del_yf = u.delxf(x, axis=ax1) + del_xb = u.delxb(x, axis=ax0) + del_yb = u.delxb(x, axis=ax1) + + self.delxy = [del_xf, del_yf, del_xb, del_yb] + self.g = 2. * self.amplitude*(del_xb + del_yb - del_xf - del_yf) + + self.LL = self.amplitude * (u.norm2(del_xf) + + u.norm2(del_yf) + + u.norm2(del_xb) + + u.norm2(del_yb)) + + return self.g + + def poly_line_coeffs(self, h, x=None): + ax0, ax1 = self.axes + if x is None: + del_xf, del_yf, del_xb, del_yb = self.delxy + else: + del_xf = u.delxf(x, axis=ax0) + del_yf = u.delxf(x, axis=ax1) + del_xb = u.delxb(x, axis=ax0) + del_yb = u.delxb(x, axis=ax1) + + hdel_xf = u.delxf(h, axis=ax0) + hdel_yf = u.delxf(h, axis=ax1) + hdel_xb = u.delxb(h, axis=ax0) + hdel_yb = u.delxb(h, axis=ax1) + + c0 = self.amplitude * (u.norm2(del_xf) + + u.norm2(del_yf) + + u.norm2(del_xb) + + u.norm2(del_yb)) + + c1 = 2 * self.amplitude * np.real(np.vdot(del_xf, hdel_xf) + + np.vdot(del_yf, hdel_yf) + + np.vdot(del_xb, hdel_xb) + + np.vdot(del_yb, hdel_yb)) + + c2 = self.amplitude * (u.norm2(hdel_xf) + + u.norm2(hdel_yf) + + u.norm2(hdel_xb) + + u.norm2(hdel_yb)) + + self.coeff = np.array([c0, c1, c2]) + return self.coeff + + +def prepare_smoothing_preconditioner(amplitude): + """ + Factory for smoothing preconditioner. 
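+
+    Returns None if amplitude is 0 and a Gaussian-blur callable if it is
+    positive; negative amplitudes (the old Hann filter) raise an error.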
+ """ + if amplitude == 0.: + return None + + class GaussFilt(object): + def __init__(self, sigma): + self.sigma = sigma + + def __call__(self, x): + return u.c_gf(x, [0, self.sigma, self.sigma]) + + # from scipy.signal import correlate2d + # class HannFilt: + # def __call__(self, x): + # y = np.empty_like(x) + # sh = x.shape + # xf = x.reshape((-1,) + sh[-2:]) + # yf = y.reshape((-1,) + sh[-2:]) + # for i in range(len(xf)): + # yf[i] = correlate2d(xf[i], + # np.array([[.0625, .125, .0625], + # [.125, .25, .125], + # [.0625, .125, .0625]]), + # mode='same') + # return y + + if amplitude > 0.: + logger.debug( + 'Using a smooth gradient filter (Gaussian blur - only for ML)') + return GaussFilt(amplitude) + + elif amplitude < 0.: + raise RuntimeError('Hann filter not implemented (negative smoothing amplitude not supported)') + # logger.debug( + # 'Using a smooth gradient filter (Hann window - only for ML)') + # return HannFilt() From b6536008abbfdaaefe91f4c23f7b76a05879da71 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 21 Nov 2024 13:56:18 +0000 Subject: [PATCH 02/12] Add all poly coeffs to ML_separate_grads --- ptypy/custom/ML_separate_grads.py | 174 +++++++++++++++++++++++++++++- 1 file changed, 170 insertions(+), 4 deletions(-) diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/ML_separate_grads.py index 49b397f9e..abb9e45f5 100644 --- a/ptypy/custom/ML_separate_grads.py +++ b/ptypy/custom/ML_separate_grads.py @@ -35,7 +35,7 @@ class MLSeparateGrads(PositionCorrectionEngine): Defaults: [name] - default = ML + default = MLSeparateGrads type = str help = doc = @@ -287,14 +287,33 @@ def engine_iterate(self, num=1): dt = self.ptycho.FType t2 = time.time() - if self.p.poly_line_coeffs == "quadratic": + if self.p.poly_line_coeffs == "all": + B_ob = self.ML_model.poly_line_all_coeffs_ob(self.ob_h) + diffB_ob = np.arange(1,len(B_ob))*B_ob[1:] # coefficients of poly derivative + roots_ob = np.roots(np.flip(diffB_ob.astype(np.double))) # roots only supports double + real_roots_ob = np.real(roots_ob[np.isreal(roots_ob)]) # not interested in complex roots + if real_roots_ob.size == 1: # single real root + self.tmin_ob = dt(real_roots_ob[0]) + else: # find real root with smallest poly objective + evalp_ob = lambda root: np.polyval(np.flip(B_ob),root) + self.tmin_ob = dt(min(real_roots_ob, key=evalp_ob)) # root with smallest poly objective + + B_pr = self.ML_model.poly_line_all_coeffs_pr(self.pr_h) + diffB_pr = np.arange(1,len(B_pr))*B_pr[1:] # coefficients of poly derivative + roots_pr = np.roots(np.flip(diffB_pr.astype(np.double))) # roots only supports double + real_roots_pr = np.real(roots_pr[np.isreal(roots_pr)]) # not interested in complex roots + if real_roots_pr.size == 1: # single real root + self.tmin_pr = dt(real_roots_pr[0]) + else: # find real root with smallest poly objective + evalp_pr = lambda root: np.polyval(np.flip(B_pr),root) + self.tmin_pr = dt(min(real_roots_pr, key=evalp_pr)) # root with smallest poly objective + elif self.p.poly_line_coeffs == "quadratic": B_ob = self.ML_model.poly_line_coeffs_ob(self.ob_h) B_pr = self.ML_model.poly_line_coeffs_pr(self.pr_h) # same as above but quicker when poly quadratic self.tmin_ob = dt(-0.5 * B_ob[1] / B_ob[2]) self.tmin_pr = dt(-0.5 * B_pr[1] / B_pr[2]) - else: raise NotImplementedError("poly_line_coeffs should be 'quadratic' or 'all'") @@ -440,6 +459,20 @@ def poly_line_coeffs_pr(self, pr_h): """ raise NotImplementedError + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial 
for line minimization + in direction h for the object + """ + raise NotImplementedError + + def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the probe + """ + raise NotImplementedError + class GaussianModel(BaseModel): """ @@ -639,7 +672,6 @@ def poly_line_coeffs_pr(self, pr_h): a = pod.fw(pr_h[pod.pr_view] * pod.object) if A0 is None: - A0 = u.abs2(f).astype(np.longdouble) A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) A2 = u.abs2(a).astype(np.longdouble) @@ -672,6 +704,140 @@ def poly_line_coeffs_pr(self, pr_h): self.B = B return B + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + B[3] += np.dot(w.flat, (2*A1*A2).flat) * Brenorm + B[4] += np.dot(w.flat, (A2**2).flat) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1. 
/ self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + B[3] += np.dot(w.flat, (2*A1*A2).flat) * Brenorm + B[4] += np.dot(w.flat, (A2**2).flat) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + class Regul_del2(object): """\ From ef7e14ebbeb0e2b5b37b18e15261e4d04b9c5ac0 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 21 Nov 2024 14:53:54 +0000 Subject: [PATCH 03/12] Add Poisson and Euclid models to ML_separate_grads --- ptypy/custom/ML_separate_grads.py | 736 +++++++++++++++++++++++++++++- 1 file changed, 729 insertions(+), 7 deletions(-) diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/ML_separate_grads.py index abb9e45f5..3c77e4b54 100644 --- a/ptypy/custom/ML_separate_grads.py +++ b/ptypy/custom/ML_separate_grads.py @@ -4,7 +4,7 @@ TODO. - * Implement other ML models (Poisson/Euclid) + * Implement other regularizers This file is part of the PTYPY package. @@ -181,10 +181,10 @@ def _initialize_model(self): # Create noise model if self.p.ML_type.lower() == "gaussian": self.ML_model = GaussianModel(self) - # elif self.p.ML_type.lower() == "poisson": - # self.ML_model = PoissonModel(self) - # elif self.p.ML_type.lower() == "euclid": - # self.ML_model = EuclidModel(self) + elif self.p.ML_type.lower() == "poisson": + self.ML_model = PoissonModel(self) + elif self.p.ML_type.lower() == "euclid": + self.ML_model = EuclidModel(self) else: raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) @@ -707,7 +707,7 @@ def poly_line_coeffs_pr(self, pr_h): def poly_line_all_coeffs_ob(self, ob_h): """ Compute all the coefficients of the polynomial for line minimization - in direction h + in direction h for the object """ B = np.zeros((5,), dtype=np.longdouble) @@ -777,7 +777,7 @@ def poly_line_all_coeffs_ob(self, ob_h): def poly_line_all_coeffs_pr(self, pr_h): """ Compute all the coefficients of the polynomial for line minimization - in direction h + in direction h for the probe """ B = np.zeros((5,), dtype=np.longdouble) @@ -839,6 +839,728 @@ def poly_line_all_coeffs_pr(self, pr_h): return B +class PoissonModel(BaseModel): + """ + Poisson noise model. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Poisson model. 
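+
+        The constant ln(I!) term of the Poisson negative log-likelihood is
+        precomputed per view with scipy.special.gammaln and added back
+        whenever the likelihood is evaluated.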
+ """ + BaseModel.__init__(self, MLengine) + from scipy import special + self.LLbase = {} + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + self.LLbase[name] = special.gammaln(di_view.data+1).sum() + + def new_grad(self): + """ + Compute a new gradient direction according to a Poisson noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + self.ob_grad.fill(0.) + self.pr_grad.fill(0.) + + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Mask and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + Imodel = np.zeros_like(I) + f = {} + + # First pod loop: compute total intensity + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f[name] = pod.fw(pod.probe * pod.object) + Imodel += u.abs2(f[name]) + + # Floating intensity option + if self.p.floating_intensities: + self.float_intens_coeff[dname] = I.sum() / Imodel.sum() + Imodel *= self.float_intens_coeff[dname] + + Imodel += 1e-6 + DI = m * (1. - I / Imodel) + + # Second pod loop: gradients computation + LLL = self.LLbase[dname] + (m * (Imodel - I * np.log(Imodel))).sum().astype(np.float64) + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + xi = pod.bw(DI * f[name]) + self.ob_grad[pod.ob_view] += 2 * xi * pod.probe.conj() + self.pr_grad[pod.pr_view] += 2 * xi * pod.object.conj() + + diff_view.error = LLL + error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0]) + LL += LLL + + # MPI reduction of gradients + self.ob_grad.allreduce() + self.pr_grad.allreduce() + parallel.allreduce(LL) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + self.ob_grad.storages[name].data += self.regularizer.grad( + s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. 
- I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. - I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the object + """ + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. 
- I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + B[3] += (0.5*np.dot(m.flat, (I*((2*A1*A2)/A0**2)).flat)) * Brenorm + B[4] += (0.5*np.dot(m.flat, (I*((A2**2)/A0**2)).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + +def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the probe + """ + B = np.zeros((5,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. - I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + B[3] += (0.5*np.dot(m.flat, (I*((2*A1*A2)/A0**2)).flat)) * Brenorm + B[4] += (0.5*np.dot(m.flat, (I*((A2**2)/A0**2)).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + +class EuclidModel(BaseModel): + """ + Euclidean (Amplitude) noise model. + TODO: feed actual statistical weights instead of using a fixed variance. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Euclidean model. + """ + BaseModel.__init__(self, MLengine) + + # Euclidean model requires weights + # TODO: update this part of the code once actual weights are passed in the PODs + self.weights = self.engine.di.copy(self.engine.di.ID + '_weights') + # FIXME: This part needs to be updated once statistical weights are properly + # supported in the data preparation. + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + self.weights[di_view] = di_view.pod.ma_view.data # just the mask for now + #self.weights[di_view] = (di_view.pod.ma_view.data + # / (1. 
+ stat_weights/di_view.data)) + + def __del__(self): + """ + Clean up routine + """ + BaseModel.__del__(self) + del self.engine.ptycho.containers[self.weights.ID] + del self.weights + + def new_grad(self): + """ + Compute a new gradient direction according to a Euclidean noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + self.ob_grad.fill(0.) + self.pr_grad.fill(0.) + + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + Amodel = np.zeros_like(A) + f = {} + + # First pod loop: compute total amplitude + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f[name] = pod.fw(pod.probe * pod.object) + Amodel += np.sqrt(u.abs2(f[name])) + + # Floating intensity option + if self.p.floating_intensities: + self.float_intens_coeff[dname] = A.sum() / Amodel.sum() + Amodel *= self.float_intens_coeff[dname] + + Amodel += 1e-6 # cf Poisson model + DA = (1. - A / Amodel) + + # Second pod loop: gradients computation + LLL = np.sum((w * (Amodel - A)**2).astype(np.float64)) + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + xi = pod.bw(w*DA * f[name]) + self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj() + self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj() + + diff_view.error = LLL + error_dct[dname] = np.array([0, LLL / np.prod(DA.shape), 0]) + LL += LLL + + # MPI reduction of gradients + self.ob_grad.allreduce() + self.pr_grad.allreduce() + parallel.allreduce(LL) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + self.ob_grad.storages[name].data += self.regularizer.grad( + s.data) + LL += self.regularizer.LL + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. 
- A/np.sqrt(A0) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_coeffs_pr(self, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. - A/np.sqrt(A0) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + + def poly_line_all_coeffs_ob(self, ob_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the object + """ + + B = np.zeros((9,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. 
- A/np.sqrt(A0) + DA32 = A/A0**(3/2) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * DA32).flat)) * Brenorm + B[3] += (0.25*np.dot(w.flat, (2*A1*A2 * DA32).flat) - 0.125*np.dot(w.flat, (A1**3/A0**2).flat)) * Brenorm + B[4] += (0.25*np.dot(w.flat, ((A2**2) * DA32).flat) - 0.125*np.dot(w.flat, ((3*A1**2*A2)/A0**2).flat) + + 0.015625*np.dot(w.flat, (A1**4/A0**3).flat)) * Brenorm + B[5] += (- 0.125*np.dot(w.flat, ((3*A1*A2**2)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((4*A1**3*A2)/A0**3).flat)) * Brenorm + B[6] += (- 0.125*np.dot(w.flat, ((A2**3)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((6*A1**2*A2**2)/A0**3).flat)) * Brenorm + B[7] += (0.015625*np.dot(w.flat, ((4*A1*A2**3)/A0**3).flat)) * Brenorm + B[8] += (0.015625*np.dot(w.flat, ((A2**4)/A0**3).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + +def poly_line_all_coeffs_pr(self, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h for the probe + """ + + B = np.zeros((9,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pr_h[pod.pr_view] * pod.object) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = u.abs2(a).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. - A/np.sqrt(A0) + DA32 = A/A0**(3/2) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * DA32).flat)) * Brenorm + B[3] += (0.25*np.dot(w.flat, (2*A1*A2 * DA32).flat) - 0.125*np.dot(w.flat, (A1**3/A0**2).flat)) * Brenorm + B[4] += (0.25*np.dot(w.flat, ((A2**2) * DA32).flat) - 0.125*np.dot(w.flat, ((3*A1**2*A2)/A0**2).flat) + + 0.015625*np.dot(w.flat, (A1**4/A0**3).flat)) * Brenorm + B[5] += (- 0.125*np.dot(w.flat, ((3*A1*A2**2)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((4*A1**3*A2)/A0**3).flat)) * Brenorm + B[6] += (- 0.125*np.dot(w.flat, ((A2**3)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((6*A1**2*A2**2)/A0**3).flat)) * Brenorm + B[7] += (0.015625*np.dot(w.flat, ((4*A1*A2**3)/A0**3).flat)) * Brenorm + B[8] += (0.015625*np.dot(w.flat, ((A2**4)/A0**3).flat)) * Brenorm + + parallel.allreduce(B) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. 
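+    # Note: the amplitude model |fw((probe + t*pr_h)*object)| is the square
+    # root of a quadratic in t (A0 + t*A1 + t**2*A2), so expanding the
+    # likelihood along the search direction produces terms up to t**8,
+    # hence the nine coefficients B[0..8] accumulated above.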
+ + self.B = B + + return B + + class Regul_del2(object): """\ Squared gradient regularizer (Gaussian prior). From ae6f01b54389e57aaa52fd526b7b38164e343016 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Thu, 21 Nov 2024 16:04:26 +0000 Subject: [PATCH 04/12] Serialize ML with separate gradient steps --- ...L_separate_grads.py => MLSeparateGrads.py} | 0 ptypy/custom/MLSeparateGrads_serial.py | 612 ++++++++++++++++++ 2 files changed, 612 insertions(+) rename ptypy/custom/{ML_separate_grads.py => MLSeparateGrads.py} (100%) create mode 100644 ptypy/custom/MLSeparateGrads_serial.py diff --git a/ptypy/custom/ML_separate_grads.py b/ptypy/custom/MLSeparateGrads.py similarity index 100% rename from ptypy/custom/ML_separate_grads.py rename to ptypy/custom/MLSeparateGrads.py diff --git a/ptypy/custom/MLSeparateGrads_serial.py b/ptypy/custom/MLSeparateGrads_serial.py new file mode 100644 index 000000000..6731e00aa --- /dev/null +++ b/ptypy/custom/MLSeparateGrads_serial.py @@ -0,0 +1,612 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2024 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +import time + +from .MLSeparateGrads import MLSeparateGrads, BaseModel +from ..accelerate.base.engines.projectional_serial import serialize_array_access +from .. import utils as u +from ..utils.verbose import logger, log +from ..utils import parallel +from ..engines.utils import Cnorm2, Cdot +from ..engines import register +from ..accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..accelerate.base.array_utils import complex_gaussian_filter, complex_gaussian_filter_fft + + +__all__ = ['MLSeparateGrads_serial'] + +@register() +class MLSeparateGrads_serial(MLSeparateGrads): + + """ + Defaults: + + [smooth_gradient_method] + default = convolution + type = str + help = Method to be used for smoothing the gradient, choose between ```convolution``` or ```fft```. + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. + """ + super(MLSeparateGrads_serial, self).__init__(ptycho_parent, pars) + + self.kernels = {} + self.diff_info = {} + self.cn2_ob_grad = 0. + self.cn2_pr_grad = 0. + + def engine_initialize(self): + """ + Prepare for ML reconstruction. + """ + super(MLSeparateGrads_serial, self).engine_initialize() + self._setup_kernels() + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. 
Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = scan.max_frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = aux + kern.a = np.zeros(ash, dtype=np.complex64) + kern.b = np.zeros(ash, dtype=np.complex64) + + # setup kernels, one for each SCAN. + kern.GDK = GradientDescentKernel(aux, nmodes) + kern.GDK.allocate() + + kern.POK = PoUpdateKernel() + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel() + kern.AWK.allocate() + + kern.FW = geo.propagator.fw + kern.BW = geo.propagator.bw + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution) + kern.PCK.allocate() + + def engine_prepare(self): + + ## Serialize new data ## + + for label, d in self.ptycho.new_data: + prep = u.Param() + prep.label = label + self.diff_info[d.ID] = prep + prep.err_phot = np.zeros((d.data.shape[0],), dtype=np.float32) + # set floating intensity coefficients to 1.0 + # they get overridden if self.p.floating_intensities=True + prep.float_intens_coeff = np.ones((d.data.shape[0],), dtype=np.float32) + + # Unfortunately this needs to be done for all pods, since + # the shape of the probe / object was modified. + # TODO: possible scaling issue + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.view_IDs, prep.poe_IDs, prep.addr = serialize_array_access(d) + # Re-create exit addresses when gradient models (single exit buffer per view) are used + # TODO: this should not be necessary, kernels should not use exit wave information + if self.kernels[prep.label].scanmodel in ("GradFull", "BlockGradFull"): + for i,addr in enumerate(prep.addr): + nmodes = len(addr[:,2,0]) + for j,ea in enumerate(addr[:,2,0]): + prep.addr[i,j,2,0] = i*nmodes+j + prep.I = d.data + if self.do_position_refinement: + prep.original_addr = np.zeros_like(prep.addr) + prep.original_addr[:] = prep.addr + prep.ma = self.ma.S[d.ID].data + + self.ML_model.prepare() + + def _get_smooth_gradient(self, data, sigma): + if self.p.smooth_gradient_method == "convolution": + return complex_gaussian_filter(data, sigma) + elif self.p.smooth_gradient_method == "fft": + return complex_gaussian_filter_fft(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. 
- self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.data[:] = self._get_smooth_gradient(s.data, self.smooth_gradient.sigma) + + norm = Cnorm2(new_ob_grad) + dot = np.real(Cdot(new_ob_grad, self.ob_grad)) + self.ob_grad << new_ob_grad + return norm, dot + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # probe support + if self.p.probe_update_start <= self.curiter: + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + # FIXME: this hack doesn't work here as we step in probe and object separately + # FIXME: really it's the probe step that should be zeroed out not the gradient + else: + new_pr_grad.fill(0.) + + norm = Cnorm2(new_pr_grad) + dot = np.real(Cdot(new_pr_grad, self.pr_grad)) + self.pr_grad << new_pr_grad + return norm, dot + + def engine_iterate(self, num=1): + """ + Compute `num` iterations. + """ + ######################## + # Compute new gradient + ######################## + tg = 0. + tc = 0. + ta = time.time() + for it in range(num): + + ######################## + # Compute new gradients + # ob: new_ob_grad + # pr: new_pr_grad + ######################## + t1 = time.time() + error_dct = self.ML_model.new_grad() + tg += time.time() - t1 + + cn2_new_pr_grad, cdotr_pr_grad = self._replace_pr_grad() + cn2_new_ob_grad, cdotr_ob_grad = self._replace_ob_grad() + + ############################ + # Compute Polak-Ribiere betas + # bt_ob = bt_num_ob/bt_denom_ob + # bt_pr = bt_num_pr/bt_denom_pr + ############################ + if self.curiter == 0: + bt_ob = 0. + bt_pr = 0. + else: + # For the object + bt_num_ob = cn2_new_ob_grad - cdotr_ob_grad + bt_denom_ob = self.cn2_ob_grad + bt_ob = max(0, bt_num_ob/bt_denom_ob) + + # For the probe + bt_num_pr = cn2_new_pr_grad - cdotr_pr_grad + bt_denom_pr = self.cn2_pr_grad + bt_pr = max(0, bt_num_pr/bt_denom_pr) + + self.cn2_ob_grad = cn2_new_ob_grad + self.cn2_pr_grad = cn2_new_pr_grad + + ############################ + # Compute next conjugates + # ob_h -= bt_ob * ob_grad + # pr_h -= bt_pr * pr_grad + # NB: in the below need to do h/tmin + # as did h*tmin when taking steps + # (don't you just love containers?) + ############################ + dt = self.ptycho.FType + self.ob_h *= dt(bt_ob / self.tmin_ob) + + # Smoothing preconditioner + if self.smooth_gradient: + for name, s in self.ob_h.storages.items(): + s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data, self.smooth_gradient.sigma) + else: + self.ob_h -= self.ob_grad + + self.pr_h *= dt(bt_pr / self.tmin_pr) + self.pr_h -= self.pr_grad + + ######################## + # Compute step-sizes + # ob: tmin_ob + # pr: tmin_pr + ######################## + t2 = time.time() + B_ob = self.ML_model.poly_line_coeffs_ob(self.ob_h) + B_pr = self.ML_model.poly_line_coeffs_pr(self.pr_h) + tc += time.time() - t2 + + self.tmin_ob = dt(-0.5 * B_ob[1] / B_ob[2]) + self.tmin_pr = dt(-0.5 * B_pr[1] / B_pr[2]) + + ######################## + # Take conjugate steps + # ob += tmin_ob * ob_h + # pr += tmin_pr * pr_h + ######################## + self.ob_h *= self.tmin_ob + self.pr_h *= self.tmin_pr + self.ob += self.ob_h + self.pr += self.pr_h + + # Position correction + self.position_update() + + # Allow for customized modifications at the end of each iteration + self._post_iterate_update() + + # increase iteration counter + self.curiter += 1 + + logger.info('Time spent in gradient calculation: %.2f' % tg) + logger.info(' .... 
in coefficient calculation: %.2f' % tc)
+ return error_dct # np.array([[self.ML_model.LL[0]] * 3])
+
+ def position_update(self):
+ """
+ Position refinement
+ """
+ if not self.do_position_refinement:
+ return
+ do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
+ do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
+
+ # Update positions
+ if do_update_pos:
+ """
+ Iterates through all positions and refines them by a given algorithm.
+ """
+ log(4, "----------- START POS REF -------------")
+ for dID in self.di.S.keys():
+
+ prep = self.diff_info[dID]
+ pID, oID, eID = prep.poe_IDs
+ ob = self.ob.S[oID].data
+ pr = self.pr.S[pID].data
+ kern = self.kernels[prep.label]
+ aux = kern.aux
+ addr = prep.addr
+ original_addr = prep.original_addr
+ mangled_addr = addr.copy()
+ w = prep.weights
+ I = prep.I
+ err_phot = prep.err_phot
+
+ PCK = kern.PCK
+ FW = kern.FW
+
+ # Keep track of object boundaries
+ max_oby = ob.shape[-2] - aux.shape[-2] - 1
+ max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+ # We need to re-calculate the current error
+ PCK.build_aux(aux, addr, ob, pr)
+ aux[:] = FW(aux)
+ PCK.log_likelihood_ml(aux, addr, I, w, err_phot)
+ error_state = np.zeros_like(err_phot)
+ error_state[:] = err_phot
+ PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+
+ log(4, 'Position refinement trial: iteration %s' % (self.curiter))
+ for i in range(PCK.mangler.nshifts):
+ PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
+ PCK.build_aux(aux, mangled_addr, ob, pr)
+ aux[:] = FW(aux)
+ PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot)
+ PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_phot)
+
+ prep.err_phot = error_state
+ prep.addr = addr
+
+ def engine_finalize(self):
+ """
+ Try deleting every helper container
+ """
+ if self.do_position_refinement and self.p.position_refinement.record:
+ for label, d in self.di.storages.items():
+ prep = self.diff_info[d.ID]
+ res = self.kernels[prep.label].resolution
+ for i,view in enumerate(d.views):
+ for j,(pname, pod) in enumerate(view.pods.items()):
+ delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res
+ pod.ob_view.coord += delta
+ pod.ob_view.storage.update_views(pod.ob_view)
+ self.ptycho.record_positions = True
+
+ # Save floating intensities into runtime
+ float_intens_coeff = {}
+ for label, d in self.di.storages.items():
+ prep = self.diff_info[d.ID]
+ float_intens_coeff[label] = prep.float_intens_coeff
+ self.ptycho.runtime["float_intens"] = parallel.gather_dict(float_intens_coeff)
+ super().engine_finalize()
+
+
+class BaseModelSerial(BaseModel):
+ """
+ Base class for log-likelihood models.
+ """
+
+ def __del__(self):
+ """
+ Clean up routine
+ """
+ pass
+
+
+class GaussianModel(BaseModelSerial):
+ """
+ Gaussian noise model.
+ TODO: feed actual statistical weights instead of using the Poisson statistic heuristic.
+ """
+
+ def __init__(self, MLengine):
+ """
+ Core functions for ML computation using a Gaussian model.
+ """
+ super(GaussianModel, self).__init__(MLengine)
+
+ def prepare(self):
+
+ super(GaussianModel, self).prepare()
+
+ for label, d in self.engine.ptycho.new_data:
+ prep = self.engine.diff_info[d.ID]
+ prep.weights = (self.Irenorm * self.engine.ma.S[d.ID].data
+ / (1.
/ self.Irenorm + d.data)).astype(d.data.dtype)
+
+ def __del__(self):
+ """
+ Clean up routine
+ """
+ super(GaussianModel, self).__del__()
+
+ def new_grad(self):
+ """
+ Compute a new gradient direction according to a Gaussian noise model.
+
+ Note: The negative log-likelihood and local errors are also computed
+ here.
+ """
+ ob_grad = self.engine.ob_grad_new
+ pr_grad = self.engine.pr_grad_new
+ ob_grad << 0.
+ pr_grad << 0.
+
+ # We need an array for MPI
+ LL = np.array([0.])
+ error_dct = {}
+
+ for dID in self.di.S.keys():
+ prep = self.engine.diff_info[dID]
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+ POK = kern.POK
+
+ aux = kern.aux
+
+ FW = kern.FW
+ BW = kern.BW
+
+ # get addresses and auxiliary array
+ addr = prep.addr
+ w = prep.weights
+ err_phot = prep.err_phot
+ fic = prep.float_intens_coeff
+
+ # local references
+ ob = self.engine.ob.S[oID].data
+ obg = ob_grad.S[oID].data
+ pr = self.engine.pr.S[pID].data
+ prg = pr_grad.S[pID].data
+ I = prep.I
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
+
+ # forward prop
+ aux[:] = FW(aux)
+
+ GDK.make_model(aux, addr)
+
+ if self.p.floating_intensities:
+ GDK.floating_intensity(addr, w, I, fic)
+
+ GDK.main(aux, addr, w, I)
+ GDK.error_reduce(addr, err_phot)
+ aux[:] = BW(aux)
+
+ POK.ob_update_ML(addr, obg, pr, aux)
+ POK.pr_update_ML(addr, prg, ob, aux)
+
+ for dID, prep in self.engine.diff_info.items():
+ err_phot = prep.err_phot
+ LL += err_phot.sum()
+ err_phot /= np.prod(prep.weights.shape[-2:])
+ err_fourier = np.zeros_like(err_phot)
+ err_exit = np.zeros_like(err_phot)
+ errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
+ error_dct.update(zip(prep.view_IDs, errs))
+
+ # MPI reduction of gradients
+ ob_grad.allreduce()
+ pr_grad.allreduce()
+ parallel.allreduce(LL)
+
+ # Object regularizer
+ if self.regularizer:
+ for name, s in self.engine.ob.storages.items():
+ ob_grad.storages[name].data += self.regularizer.grad(s.data)
+ LL += self.regularizer.LL
+
+ self.LL = LL / self.tot_measpts
+ return error_dct
+
+ def poly_line_coeffs_ob(self, c_ob_h):
+ """
+ Compute the coefficients of the polynomial for line minimization
+ in direction h for the object
+ """
+
+ B = np.zeros((3,), dtype=np.longdouble)
+ Brenorm = 1.
/ self.LL[0] ** 2
+
+ # Outer loop: through diffraction patterns
+ for dID in self.di.S.keys():
+ prep = self.engine.diff_info[dID]
+
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+
+ f = kern.aux
+ a = kern.a
+
+ FW = kern.FW
+
+ # get addresses and auxiliary array
+ addr = prep.addr
+ w = prep.weights
+ fic = prep.float_intens_coeff
+
+ # local references
+ ob = self.ob.S[oID].data
+ ob_h = c_ob_h.S[oID].data
+ pr = self.pr.S[pID].data
+ I = self.di.S[dID].data
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
+ AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False)
+
+ # forward prop
+ f[:] = FW(f)
+ a[:] = FW(a)
+
+ GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel
+ GDK.fill_b(addr, Brenorm, w, B)
+
+ parallel.allreduce(B)
+
+ # Object regularizer
+ if self.regularizer:
+ for name, s in self.ob.storages.items():
+ B += Brenorm * self.regularizer.poly_line_coeffs(
+ c_ob_h.storages[name].data, s.data)
+
+ self.B = B
+
+ return B
+
+ def poly_line_coeffs_pr(self, c_pr_h):
+ """
+ Compute the coefficients of the polynomial for line minimization
+ in direction h for the probe
+ """
+
+ B = np.zeros((3,), dtype=np.longdouble)
+ Brenorm = 1. / self.LL[0] ** 2
+
+ # Outer loop: through diffraction patterns
+ for dID in self.di.S.keys():
+ prep = self.engine.diff_info[dID]
+
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+
+ f = kern.aux
+ a = kern.a
+
+ FW = kern.FW
+
+ # get addresses and auxiliary array
+ addr = prep.addr
+ w = prep.weights
+ fic = prep.float_intens_coeff
+
+ # local references
+ ob = self.ob.S[oID].data
+ pr = self.pr.S[pID].data
+ pr_h = c_pr_h.S[pID].data
+ I = self.di.S[dID].data
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
+ AWK.build_aux_no_ex(a, addr, ob, pr_h, add=True)
+
+ # forward prop
+ f[:] = FW(f)
+ a[:] = FW(a)
+
+ GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel
+ GDK.fill_b(addr, Brenorm, w, B)
+
+ parallel.allreduce(B)
+
+ self.B = B
+
+ return B

From 17ba4adfe557eba0443170dc7f1ea33c9c901d70 Mon Sep 17 00:00:00 2001
From: Jaroslav Fowkes
Date: Thu, 21 Nov 2024 16:21:48 +0000
Subject: [PATCH 05/12] Drop probe_update_start from MLSeparateGrads

---
 ptypy/custom/MLSeparateGrads.py | 24 ++++++------------------
 ptypy/custom/MLSeparateGrads_serial.py | 12 +++---------
 2 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/ptypy/custom/MLSeparateGrads.py b/ptypy/custom/MLSeparateGrads.py
index 3c77e4b54..1448f879d 100644
--- a/ptypy/custom/MLSeparateGrads.py
+++ b/ptypy/custom/MLSeparateGrads.py
@@ -88,13 +88,6 @@ class MLSeparateGrads(PositionCorrectionEngine):
 help = Whether to use the object/probe scaling preconditioner
 doc = This parameter can give faster convergence for weakly scattering samples.
- [probe_update_start] - default = 0 - type = int - lowlim = 0 - help = Number of iterations before probe update starts - # NOTE: probe_update_start doesn't work with this code, need to add some code to fix this - [poly_line_coeffs] default = quadratic type = str @@ -219,17 +212,12 @@ def engine_iterate(self, num=1): new_ob_grad, new_pr_grad = self.ob_grad_new, self.pr_grad_new tg += time.time() - t1 - if self.p.probe_update_start <= self.curiter: - # Apply probe support if needed - for name, s in new_pr_grad.storages.items(): - self.support_constraint(s) - #support = self.probe_support.get(name) - #if support is not None: - # s.data *= support - # FIXME: this hack doesn't work here as we step in probe and object separately - # FIXME: really it's the probe step that should be zeroed out not the gradient - else: - new_pr_grad.fill(0.) + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + #support = self.probe_support.get(name) + #if support is not None: + # s.data *= support # Smoothing preconditioner if self.smooth_gradient: diff --git a/ptypy/custom/MLSeparateGrads_serial.py b/ptypy/custom/MLSeparateGrads_serial.py index 6731e00aa..572e9a323 100644 --- a/ptypy/custom/MLSeparateGrads_serial.py +++ b/ptypy/custom/MLSeparateGrads_serial.py @@ -174,15 +174,9 @@ def _replace_ob_grad(self): def _replace_pr_grad(self): new_pr_grad = self.pr_grad_new - # probe support - if self.p.probe_update_start <= self.curiter: - # Apply probe support if needed - for name, s in new_pr_grad.storages.items(): - self.support_constraint(s) - # FIXME: this hack doesn't work here as we step in probe and object separately - # FIXME: really it's the probe step that should be zeroed out not the gradient - else: - new_pr_grad.fill(0.) + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) norm = Cnorm2(new_pr_grad) dot = np.real(Cdot(new_pr_grad, self.pr_grad)) From fe0df7d260b10fb16fb5f9025e3af82bfeeade91 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 6 Jan 2025 09:04:04 +0000 Subject: [PATCH 06/12] Add required kernel function --- ptypy/accelerate/base/kernels.py | 31 ++++++++++++++++++++++++++ ptypy/custom/MLSeparateGrads_serial.py | 4 ++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index c5343f940..2d6b438da 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -269,6 +269,37 @@ def make_model(self, b_aux, addr): tf = aux.reshape(sh[0], self.nmodes, sh[1], sh[2]) Imodel[:] = ((tf * tf.conj()).real).sum(1) + def make_a012_notb(self, b_f, b_a, addr, I, fic): + + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.npy.Imodel + A1 = self.npy.LLerr + A2 = self.npy.LLden + + # batch buffers + f = b_f[:maxz * self.nmodes] + a = b_a[:maxz * self.nmodes] + + ## Actual math ## (subset of FUK.fourier_error) + fc = fic.reshape((maxz,1,1)) + A0.fill(0.) + tf = np.real(f * f.conj()).astype(self.ftype) + A0[:maxz] = np.double(tf.reshape(maxz, self.nmodes, sh[1], sh[2]).sum(1) * fc) - I + + A1.fill(0.) + tf = 2. * np.real(f * a.conj()) + A1[:maxz] = tf.reshape(maxz, self.nmodes, sh[1], sh[2]).sum(1) * fc + + A2.fill(0.) 
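+ # A2 is the pure second-order coefficient: per-mode |a|^2, summed over
+ # modes and scaled by the floating-intensity coefficient (no b array is
+ # passed here, hence no 2*Re(f*conj(b)) cross term in this variant)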
+ tf = np.real(a * a.conj()) + A2[:maxz] = tf.reshape(maxz, self.nmodes, sh[1], sh[2]).sum(1) * fc + return + def make_a012(self, b_f, b_a, b_b, addr, I, fic): # reference shape (= GPU global dims) diff --git a/ptypy/custom/MLSeparateGrads_serial.py b/ptypy/custom/MLSeparateGrads_serial.py index 572e9a323..e5c2283e7 100644 --- a/ptypy/custom/MLSeparateGrads_serial.py +++ b/ptypy/custom/MLSeparateGrads_serial.py @@ -536,7 +536,7 @@ def poly_line_coeffs_ob(self, c_ob_h): f[:] = FW(f) a[:] = FW(a) - GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel + GDK.make_a012_notb(f, a, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) @@ -596,7 +596,7 @@ def poly_line_coeffs_pr(self, c_pr_h): f[:] = FW(f) a[:] = FW(a) - GDK.make_a012(f, a, 0, addr, I, fic) # FIXME: need new kernel + GDK.make_a012_notb(f, a, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B) parallel.allreduce(B) From efe000ff9b266cd3e0dbe70a39263f4b2364cb8d Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 6 Jan 2025 09:18:57 +0000 Subject: [PATCH 07/12] Add kernel regression test --- .../gradient_descent_kernel_test.py | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py index 453edc2ff..9749f8e39 100644 --- a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py +++ b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py @@ -125,6 +125,67 @@ def test_floating_intensity(self): np.testing.assert_array_almost_equal(exp_fic, fic, err_msg="floating intensity coeff (fic) has not been updated as expected") + def test_make_a012_notb(self): + b_f, b_a, _, I, w, err_sum, addr = self.prepare_arrays() + fic = np.ones((I.shape[0],), dtype=I.dtype) + GDK=GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012_notb(b_f, b_a, addr, I, fic) + #print('Imodel',repr(GDK.npy.Imodel)) + #print('LLerr',repr(GDK.npy.LLerr)) + #print('LLden',repr(GDK.npy.LLden)) + exp_A0 = np.array([[[ 1., 1., 1.], + [ 2., 2., 2.], + [ 5., 5., 5.]], + + [[12., 12., 12.], + [13., 13., 13.], + [16., 16., 16.]], + + [[37., 37., 37.], + [38., 38., 38.], + [41., 41., 41.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal(exp_A0, GDK.npy.Imodel, + err_msg="`Imodel` buffer (=A0) has not been updated as expected") + exp_A1 = np.array([[[ 0., 0., 0.], + [ 2., 6., 10.], + [ 4., 12., 20.]], + + [[ 0., 0., 0.], + [10., 14., 18.], + [20., 28., 36.]], + + [[ 0., 0., 0.], + [18., 22., 26.], + [36., 44., 52.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal(exp_A1, GDK.npy.LLerr, + err_msg="`LLerr` buffer (=A1) has not been updated as expected") + exp_A2 = np.array([[[ 0., 4., 12.], + [ 4., 8., 16.], + [12., 16., 24.]], + + [[ 0., 12., 28.], + [12., 24., 40.], + [28., 40., 56.]], + + [[ 0., 20., 44.], + [20., 40., 64.], + [44., 64., 88.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + print(GDK.npy.LLden) + np.testing.assert_array_almost_equal(exp_A2, GDK.npy.LLden, + err_msg="`LLden` buffer (=A2) has not been updated as expected") def test_make_a012(self): b_f, b_a, b_b, I, w, err_sum, addr = self.prepare_arrays() From 60c7e60828e1add0490e86434dc5e27e7a334397 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 6 Jan 2025 09:29:23 +0000 Subject: [PATCH 08/12] Fix kernel regression test --- 
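[Annotation, not part of the commit: make_a012_notb models the intensity
along a step t as sum_modes |f + t*a|^2 * fic, so the quadratic coefficient
is A2 = sum_modes |a|^2 * fic; the previous expected values appear to have
been carried over from test_make_a012, which includes the 2*Re(f*conj(b))
cross term. A minimal NumPy sketch of the identity being tested, with
hypothetical single-mode values (fic == 1, not the test fixtures):

import numpy as np
f = np.array([1 + 2j, 3 + 0j])    # propagated aux wave (hypothetical)
a = np.array([0 + 1j, 1 + 1j])    # propagated step direction (hypothetical)
I = np.array([2., 4.])            # measured intensities (hypothetical)
A0 = np.abs(f)**2 - I             # model minus data at t = 0
A1 = 2 * np.real(f * np.conj(a))  # first-order coefficient
A2 = np.abs(a)**2                 # second-order coefficient, no b term
t = 0.7                           # any step size
assert np.allclose(np.abs(f + t*a)**2 - I, A0 + A1*t + A2*t**2)
]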
.../gradient_descent_kernel_test.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py
index 9749f8e39..675d7690a 100644
--- a/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py
+++ b/test/accelerate_tests/base_tests/gradient_descent_kernel_test.py
@@ -168,22 +168,21 @@ def test_make_a012_notb(self):
 [ 0., 0., 0.]]], dtype=FLOAT_TYPE)
 np.testing.assert_array_almost_equal(exp_A1, GDK.npy.LLerr,
 err_msg="`LLerr` buffer (=A1) has not been updated as expected")
- exp_A2 = np.array([[[ 0., 4., 12.],
- [ 4., 8., 16.],
- [12., 16., 24.]],
-
- [[ 0., 12., 28.],
- [12., 24., 40.],
- [28., 40., 56.]],
-
- [[ 0., 20., 44.],
- [20., 40., 64.],
- [44., 64., 88.]],
+ exp_A2 = np.array([[[ 0., 2., 8.],
+ [ 2., 4., 10.],
+ [ 8., 10., 16.]],
+
+ [[ 0., 2., 8.],
+ [ 2., 4., 10.],
+ [ 8., 10., 16.]],
+
+ [[ 0., 2., 8.],
+ [ 2., 4., 10.],
+ [ 8., 10., 16.]],
 [[ 0., 0., 0.],
 [ 0., 0., 0.],
 [ 0., 0., 0.]]], dtype=FLOAT_TYPE)
- print(GDK.npy.LLden)
 np.testing.assert_array_almost_equal(exp_A2, GDK.npy.LLden,
 err_msg="`LLden` buffer (=A2) has not been updated as expected")

From 7c1a288060c96438598af36c43297f6264f2319c Mon Sep 17 00:00:00 2001
From: Jari
Date: Tue, 7 Jan 2025 14:52:21 +0000
Subject: [PATCH 09/12] Add CUDA kernel

---
 .../accelerate/cuda_common/make_a012_notb.cu | 61 +++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 ptypy/accelerate/cuda_common/make_a012_notb.cu

diff --git a/ptypy/accelerate/cuda_common/make_a012_notb.cu b/ptypy/accelerate/cuda_common/make_a012_notb.cu
new file mode 100644
index 000000000..94c7ab68a
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/make_a012_notb.cu
@@ -0,0 +1,61 @@
+/** make_a012 without the b (second-order) array.
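+ * Computes the quadratic coefficients of the modelled intensity along a
+ * step t, I(t) = sum_modes |f + t*a|^2 * fic:
+ * A0 = sum_modes |f|^2 * fic - I
+ * A1 = 2 * sum_modes Re(f * conj(a)) * fic
+ * A2 = sum_modes |a|^2 * fic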
+ * + * Data types: + * - IN_TYPE: the data type for the inputs (float or double) + * - OUT_TYPE: the data type for the outputs (float or double) + * - MATH_TYPE: the data type used for computation + * - ACC_TYPE: data type used for accumulation + */ + +#include "common.cuh" + +extern "C" __global__ void make_a012_notb(const complex* f, + const complex* a, + const IN_TYPE* I, + const IN_TYPE* fic, + OUT_TYPE* A0, + OUT_TYPE* A1, + OUT_TYPE* A2, + int z, + int y, + int x, + int maxz) +{ + int ix = threadIdx.x + blockIdx.x * blockDim.x; + int iz = blockIdx.z; + + if (ix >= x) + return; + + if (iz >= maxz) + { + A0[iz * x + ix] = OUT_TYPE(0); // make sure it's the right type (double/float) + A1[iz * x + ix] = OUT_TYPE(0); + A2[iz * x + ix] = OUT_TYPE(0); + return; + } + + // we sum across y directly, as this is the number of modes, + // which is typically small + auto sumtf0 = ACC_TYPE(0); + auto sumtf1 = ACC_TYPE(0); + auto sumtf2 = ACC_TYPE(0); + for (auto iy = 0; iy < y; ++iy) + { + complex fv = f[iz * y * x + iy * x + ix]; + sumtf0 += fv.real() * fv.real() + fv.imag() * fv.imag(); + + complex av = a[iz * y * x + iy * x + ix]; + // 2 * real(f * conj(a)) + sumtf1 += MATH_TYPE(2) * (fv.real() * av.real() + fv.imag() * av.imag()); + + // abs(a)^2 + sumtf2 += av.real() * av.real() + av.imag() * av.imag(); + } + + MATH_TYPE Iv = I[iz * x + ix]; + MATH_TYPE ficv = fic[iz]; + A0[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf0) * ficv - Iv); + A1[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf1) * ficv); + A2[iz * x + ix] = OUT_TYPE(MATH_TYPE(sumtf2) * ficv); +} \ No newline at end of file From 9ad9985b71c6e1852c8fcf96e8913cbd6ed619a4 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 13 Jan 2025 13:34:31 +0000 Subject: [PATCH 10/12] Add cupy and pycuda kernels --- ptypy/accelerate/cuda_cupy/kernels.py | 24 ++++++++++++++++++++++++ ptypy/accelerate/cuda_pycuda/kernels.py | 23 +++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py index b743e9b08..becaf78b2 100644 --- a/ptypy/accelerate/cuda_cupy/kernels.py +++ b/ptypy/accelerate/cuda_cupy/kernels.py @@ -676,6 +676,7 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type='double', math_typ } self.make_model_cuda = load_kernel('make_model', subs) self.make_a012_cuda = load_kernel('make_a012', subs) + self.make_a012_notb_cuda = load_kernel('make_a012_notb', subs) self.error_reduce_cuda = load_kernel('error_reduce', { **subs, 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', @@ -748,6 +749,29 @@ def make_a012(self, b_f, b_a, b_b, addr, I, fic): args=(b_f, b_a, b_b, I, fic, A0, A1, A2, z, y, x, maxz)) + def make_a012_notb(self, b_f, b_a, addr, I, fic): + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + z = np.int32(sh[0]) + maxz = np.int32(maxz) + y = np.int32(self.nmodes) + x = np.int32(sh[1]*sh[2]) + bx = 1024 + if self.queue is not None: + self.queue.use() + self.make_a012_notb_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(b_f, b_a, I, fic, + A0, A1, A2, z, y, x, maxz)) + def fill_b(self, addr, Brenorm, w, B): # stopper maxz = w.shape[0] diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 8fcebb6ef..541a1e8b1 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -654,6 +654,7 @@ def __init__(self, aux, 
nmodes=1, queue=None, accumulate_type = 'double', math_t } self.make_model_cuda = load_kernel('make_model', subs) self.make_a012_cuda = load_kernel('make_a012', subs) + self.make_a012_notb_cuda = load_kernel('make_a012_notb', subs) self.error_reduce_cuda = load_kernel('error_reduce', { **subs, 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double', @@ -723,6 +724,28 @@ def make_a012(self, b_f, b_a, b_b, addr, I, fic): grid=(int((x + bx - 1) // bx), 1, int(z)), stream=self.queue) + def make_a012_notb(self, b_f, b_a, addr, I, fic): + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + z = np.int32(sh[0]) + maxz = np.int32(maxz) + y = np.int32(self.nmodes) + x = np.int32(sh[1]*sh[2]) + bx = 1024 + self.make_a012_notb_cuda(b_f, b_a, I, fic, + A0, A1, A2, z, y, x, maxz, + block=(bx, 1, 1), + grid=(int((x + bx - 1) // bx), 1, int(z)), + stream=self.queue) + def fill_b(self, addr, Brenorm, w, B): # stopper maxz = w.shape[0] From b25b4f6a4c9a97af6b648d01c5df619614a700a3 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 13 Jan 2025 13:46:03 +0000 Subject: [PATCH 11/12] Add cupy and pycuda kernel regression tests --- .../gradient_descent_kernel_test.py | 73 +++++++++++++++++-- .../gradient_descent_kernel_test.py | 73 +++++++++++++++++-- 2 files changed, 136 insertions(+), 10 deletions(-) diff --git a/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py index b3a6e8ff7..55837771b 100644 --- a/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py @@ -23,9 +23,9 @@ class GradientDescentKernelTest(CupyCudaTest): def prepare_arrays(self, performance=False): if not performance: nmodes = 2 - N_buf = 4 - N = 3 - A = 3 + N_buf = 4 + N = 3 + A = 3 else: nmodes = 4 N_buf = 100 @@ -198,6 +198,69 @@ def test_make_a012(self): exp_A2, GDK.gpu.LLden.get(), err_msg="`LLden` buffer (=A2) has not been updated as expected") + def test_make_a012_notb(self): + b_f, b_a, _, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012_notb(b_f, b_a, addr, I, fic) + + exp_A0 = np.array([[[1., 1., 1.], + [2., 2., 2.], + [5., 5., 5.]], + + [[12., 12., 12.], + [13., 13., 13.], + [16., 16., 16.]], + + [[37., 37., 37.], + [38., 38., 38.], + [41., 41., 41.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A0, GDK.gpu.Imodel.get(), + err_msg="`Imodel` buffer (=A0) has not been updated as expected") + + exp_A1 = np.array([[[0., 0., 0.], + [2., 6., 10.], + [4., 12., 20.]], + + [[0., 0., 0.], + [10., 14., 18.], + [20., 28., 36.]], + + [[0., 0., 0.], + [18., 22., 26.], + [36., 44., 52.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A1, GDK.gpu.LLerr.get(), + err_msg="`LLerr` buffer (=A1) has not been updated as expected") + + exp_A2 = np.array([[[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A2, GDK.gpu.LLden.get(), + err_msg="`LLden` buffer (=A2) has not been updated as expected") + 
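+ # NB: exp_A2 above is the pure |a|^2 * fic term; make_a012's
+ # 2*Re(f*conj(b)) contribution has no counterpart in the _notb kernel.
+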
@unittest.skipIf(not perfrun, "performance test") def test_make_a012_performance(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) @@ -233,7 +296,7 @@ def test_fill_b_perf(self): GDK.allocate() GDK.make_a012(b_f, b_a, b_b, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B_dev) - + def test_error_reduce(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() GDK = GradientDescentKernel(b_f, addr.shape[1]) @@ -246,7 +309,7 @@ def test_error_reduce(self): np.testing.assert_array_almost_equal( exp_err, err_sum.get(), err_msg="`err_sum` has not been updated as expected") - + @unittest.skipIf(not perfrun, "performance test") def test_error_reduce_perf(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) diff --git a/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py index 6caed13f2..c08a61e9f 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/gradient_descent_kernel_test.py @@ -23,9 +23,9 @@ class GradientDescentKernelTest(PyCudaTest): def prepare_arrays(self, performance=False): if not performance: nmodes = 2 - N_buf = 4 - N = 3 - A = 3 + N_buf = 4 + N = 3 + A = 3 else: nmodes = 4 N_buf = 100 @@ -198,6 +198,69 @@ def test_make_a012(self): exp_A2, GDK.gpu.LLden.get(), err_msg="`LLden` buffer (=A2) has not been updated as expected") + def test_make_a012_notb(self): + b_f, b_a, _, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012_notb(b_f, b_a, addr, I, fic) + + exp_A0 = np.array([[[1., 1., 1.], + [2., 2., 2.], + [5., 5., 5.]], + + [[12., 12., 12.], + [13., 13., 13.], + [16., 16., 16.]], + + [[37., 37., 37.], + [38., 38., 38.], + [41., 41., 41.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A0, GDK.gpu.Imodel.get(), + err_msg="`Imodel` buffer (=A0) has not been updated as expected") + + exp_A1 = np.array([[[0., 0., 0.], + [2., 6., 10.], + [4., 12., 20.]], + + [[0., 0., 0.], + [10., 14., 18.], + [20., 28., 36.]], + + [[0., 0., 0.], + [18., 22., 26.], + [36., 44., 52.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A1, GDK.gpu.LLerr.get(), + err_msg="`LLerr` buffer (=A1) has not been updated as expected") + + exp_A2 = np.array([[[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 2., 8.], + [ 2., 4., 10.], + [ 8., 10., 16.]], + + [[ 0., 0., 0.], + [ 0., 0., 0.], + [ 0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A2, GDK.gpu.LLden.get(), + err_msg="`LLden` buffer (=A2) has not been updated as expected") + @unittest.skipIf(not perfrun, "performance test") def test_make_a012_performance(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) @@ -233,7 +296,7 @@ def test_fill_b_perf(self): GDK.allocate() GDK.make_a012(b_f, b_a, b_b, addr, I, fic) GDK.fill_b(addr, Brenorm, w, B_dev) - + def test_error_reduce(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() GDK = GradientDescentKernel(b_f, addr.shape[1]) @@ -246,7 +309,7 @@ def test_error_reduce(self): np.testing.assert_array_almost_equal( exp_err, err_sum.get(), err_msg="`err_sum` has not been updated as expected") - + 
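+ # NB: exp_A2 above is the pure |a|^2 * fic term; make_a012's
+ # 2*Re(f*conj(b)) contribution has no counterpart in the _notb kernel.
+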
@unittest.skipIf(not perfrun, "performance test") def test_error_reduce_perf(self): b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) From 1a24467f5f6ac12d306dd22325d264d5a64fe057 Mon Sep 17 00:00:00 2001 From: Jaroslav Fowkes Date: Mon, 13 Jan 2025 14:19:49 +0000 Subject: [PATCH 12/12] CuPy and PyCUDA MLSeparateGrads --- ptypy/custom/MLSeparateGrads_cupy.py | 881 +++++++++++++++++++++++++ ptypy/custom/MLSeparateGrads_pycuda.py | 858 ++++++++++++++++++++++++ 2 files changed, 1739 insertions(+) create mode 100644 ptypy/custom/MLSeparateGrads_cupy.py create mode 100644 ptypy/custom/MLSeparateGrads_pycuda.py diff --git a/ptypy/custom/MLSeparateGrads_cupy.py b/ptypy/custom/MLSeparateGrads_cupy.py new file mode 100644 index 000000000..14899895c --- /dev/null +++ b/ptypy/custom/MLSeparateGrads_cupy.py @@ -0,0 +1,881 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +import cupy as cp +import cupyx + +from ptypy.engines import register +from .MLSeparateGrads_serial import MLSeparateGrads_serial +from ptypy.accelerate.base.engines.ML_serial import BaseModelSerial +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.accelerate.cuda_cupy import get_context, log_device_memory_stats +from ptypy.accelerate.cuda_cupy.kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ptypy.accelerate.cuda_cupy.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ptypy.accelerate.cuda_cupy.array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, FFTGaussianSmoothingKernel, TransposeKernel +from ptypy.accelerate.cuda_cupy.mem_utils import GpuDataManager + +#from ..mem_utils import GpuDataManager +from ptypy.accelerate.base import address_manglers + +__all__ = ['MLSeparateGrads_cupy'] + +# can be used to limit the number of blocks, simulating that they don't fit +MAX_BLOCKS = 99999 +# MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit + + +@register() +class MLSeparateGrads_cupy(MLSeparateGrads_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [fft_lib] + default = cuda + type = str + help = Choose the cupy-compatible FFT module. + doc = One of: + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'cupy'`` : cupy using cufft (fast load, slowest compute due to additional store/load stages) + choices = 'cuda','cupy' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. + """ + super().__init__(ptycho_parent, pars) + + def engine_initialize(self): + """ + Prepare for ML reconstruction. 
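+ Sets up the compute queue and transfer streams, the gradient-smoothing
+ kernels and the per-storage probe support kernels.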
+ """ + self.queue = get_context(new_queue=True) + + self.qu_htod = cp.cuda.Stream() + self.qu_dtoh = cp.cuda.Stream() + + if self.p.smooth_gradient_method == "convolution": + self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None + + if self.p.smooth_gradient_method == "fft": + self.FGSK = FFTGaussianSmoothingKernel(queue=self.queue) + + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + super().engine_initialize() + # self._setup_kernels() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + AUK = ArrayUtilsKernel(queue=self.queue) + self._dot_kernel = AUK.dot + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = scan.max_frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple([int(s) for s in geo.shape]) + aux = cp.zeros(ash, dtype=np.complex64) + kern.aux = aux + kern.a = cp.zeros(ash, dtype=np.complex64) + kern.b = cp.zeros(ash, dtype=np.complex64) + + # setup kernels, one for each SCAN. + kern.GDK = GradientDescentKernel( + aux, nmodes, queue=self.queue, math_type="double") + kern.GDK.allocate() + + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + kern.TK = TransposeKernel(queue=self.queue) + + kern.PROP = PropagationKernel( + aux, geo.propagator, queue_thread=self.queue, fft_type=self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + kern.PCK = PositionCorrectionKernel( + aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) + kern.PCK.allocate() + + mag_mem = 0 + for scan, kern in self.kernels.items(): + mag_mem = max(kern.aux.nbytes // 2, mag_mem) + ma_mem = mag_mem + blk = ma_mem + mag_mem + + # We need to add the free memory from the pool to the free device memory, + # as both will be used for allocations + mempool = cp.get_default_memory_pool() + mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes() + + # leave 200MB room for safety + fit = int(mem - 200 * 1024 * 1024) // blk + if not fit: + log(1, "Cannot fit memory into device, if possible reduce frames per block. 
Exiting...") + raise SystemExit("ptypy has been exited.") + + # TODO grow blocks dynamically + nma = min(fit, MAX_BLOCKS) + log_device_memory_stats(4) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3))) + log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma)) + # reset memory or create new + self.w_data = GpuDataManager(ma_mem, 0, nma, False) + self.I_data = GpuDataManager(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super().engine_prepare() + ## Serialize new data ## + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # recursive copy to gpu for probe and object + for _cname, c in self.ptycho.containers.items(): + if c.original != self.pr and c.original != self.ob: + continue + for _sname, s in c.S.items(): + # convert data here + s.gpu = cp.asarray(s.data) + s.cpu = cupyx.empty_pinned( + s.data.shape, s.data.dtype, order="C") + s.cpu[:] = s.data + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.err_phot_gpu = cp.asarray(prep.err_phot) + prep.fic_gpu = cp.ones_like(prep.err_phot_gpu) + + if use_tiles: + prep.addr2 = np.ascontiguousarray( + np.transpose(prep.addr, (2, 3, 0, 1))) + + prep.addr_gpu = cp.asarray(prep.addr) + if self.do_position_refinement: + prep.original_addr_gpu = cp.asarray(prep.original_addr) + prep.error_state_gpu = cp.empty_like(prep.err_phot_gpu) + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + # Todo: Which address to pick? + if use_tiles: + prep.addr2_gpu = cp.asarray(prep.addr2) + + prep.I = cupyx.empty_pinned(d.data.shape, d.data.dtype, order="C") + prep.I[:] = d.data + + # Todo: avoid that extra copy of data + if self.do_position_refinement: + ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C") + prep.ma[:] = ma + + log(4, 'Free memory on device: %.2f GB' % + (float(cp.cuda.runtime.memGetInfo()[0])/1e9)) + self.w_data.add_data_block() + self.I_data.add_data_block() + + self.dID_list = list(self.di.S.keys()) + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): + """ + Overloading the context of Storage.data here, to allow for in-place math on Container instances: + """ + if container is not None: + if container.original == self.pr or container.original == self.ob: + for s in container.S.values(): + # convert data here + if dev == 'gpu': + s.data = s.gpu + if sync_copy: + s.gpu.set(s.cpu) + elif dev == 'cpu': + s.data = s.cpu + if sync_copy: + s.gpu.get(out=s.cpu) + #print('%s to cpu' % s.ID) + else: + for container in self.ptycho.containers.values(): + self._set_pr_ob_ref_for_data( + dev=dev, container=container, sync_copy=sync_copy) + + def _get_smooth_gradient(self, data, sigma): + if self.p.smooth_gradient_method == "convolution": + if self.GSK.tmp is None: + self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64) + try: + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + except MemoryError: + raise RuntimeError("Convolution 
kernel too large for direct convolution on GPU", + "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.") + elif self.p.smooth_gradient_method == "fft": + self.FGSK.filter(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") + return data + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.gpu = self._get_smooth_gradient( + s.gpu, self.smooth_gradient.sigma) + + return self._replace_grad(self.ob_grad, new_ob_grad) + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + + return self._replace_grad(self.pr_grad, new_pr_grad) + + def _replace_grad(self, grad, new_grad): + norm = np.double(0.) + dot = np.double(0.) + for name, new in new_grad.storages.items(): + old = grad.storages[name] + norm += self._dot_kernel(new.gpu, new.gpu).get()[0] + dot += self._dot_kernel(new.gpu, old.gpu).get()[0] + old.gpu[:] = new.gpu + return norm, dot + + def engine_iterate(self, num=1): + err = super().engine_iterate(num) + # copy all data back to cpu + self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True) + return err + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > + self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % + self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. 
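+ Candidate shifts are drawn from the position-correction kernel's
+ address mangler; a mangled address is kept whenever it lowers the
+ ML error.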
+ """ + log(4, "----------- START POS REF -------------") + for dID in self.dID_list: + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + err_phot = prep.err_phot_gpu + error_state = prep.error_state_gpu + + # copy intensities and weights to GPU + ev_w, w, data_w = self.w_data.to_gpu( + prep.weights, dID, self.qu_htod) + ev, I, data_I = self.I_data.to_gpu(prep.I, dID, self.qu_htod) + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + PCK.queue.wait_event(ev) + # w & I now on device + PCK.log_likelihood_ml(aux, addr, I, w, err_phot) + cp.cuda.runtime.memcpy(dst=error_state.data.ptr, + src=err_phot.data.ptr, + size=err_phot.nbytes, + kind=3) # d2d + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address( + i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot) + PCK.update_addr_and_error_state( + addr, error_state, mangled_addr, err_phot) + + data_w.record_done(self.queue, 'compute') + data_I.record_done(self.queue, 'compute') + cp.cuda.runtime.memcpy(dst=err_phot.data.ptr, + src=error_state.data.ptr, + size=err_phot.nbytes, + kind=3) # d2d + if use_tiles: + s1 = addr.shape[0] * addr.shape[1] + s2 = addr.shape[2] * addr.shape[3] + TK.transpose(addr.reshape(s1, s2), + prep.addr2_gpu.reshape(s2, s1)) + + self.dID_list.reverse() + + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe. + """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel( + supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel( + support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.w_data = None + self.I_data = None + + for name, s in self.pr.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for name, s in self.ob.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + prep.float_intens_coeff = prep.fic_gpu.get() + + # self.queue.synchronize() + super().engine_finalize() + + log_device_memory_stats(4) + + +class GaussianModel(BaseModelSerial): + """ + Gaussian noise model. 
+ TODO: feed actual statistical weights instead of using the Poisson statistic heuristic.
+ """
+
+ def __init__(self, MLengine):
+ """
+ Core functions for ML computation using a Gaussian model.
+ """
+ super(GaussianModel, self).__init__(MLengine)
+
+ if self.p.reg_del2:
+ self.regularizer = Regul_del2_cupy(
+ self.p.reg_del2_amplitude,
+ queue=self.engine.queue
+ )
+ else:
+ self.regularizer = None
+
+ def prepare(self):
+
+ super(GaussianModel, self).prepare()
+
+ for label, d in self.engine.ptycho.new_data:
+ prep = self.engine.diff_info[d.ID]
+ w = (self.Irenorm * self.engine.ma.S[d.ID].data
+ / (1. / self.Irenorm + d.data)).astype(d.data.dtype)
+ prep.weights = cupyx.empty_pinned(w.shape, w.dtype, order="C")
+ prep.weights[:] = w
+
+ def __del__(self):
+ """
+ Clean up routine
+ """
+ super(GaussianModel, self).__del__()
+
+ def new_grad(self):
+ """
+ Compute a new gradient direction according to a Gaussian noise model.
+
+ Note: The negative log-likelihood and local errors are also computed
+ here.
+ """
+ ob_grad = self.engine.ob_grad_new
+ pr_grad = self.engine.pr_grad_new
+ qu_htod = self.engine.qu_htod
+ queue = self.engine.queue
+
+ self.engine._set_pr_ob_ref_for_data('gpu')
+ ob_grad << 0.
+ pr_grad << 0.
+
+ # We need an array for MPI
+ LL = np.array([0.])
+ error_dct = {}
+
+ for dID in self.engine.dID_list:
+ prep = self.engine.diff_info[dID]
+ # find probe, object and exit IDs corresponding to this dID
+ pID, oID, eID = prep.poe_IDs
+
+ # references for kernels
+ kern = self.engine.kernels[prep.label]
+ GDK = kern.GDK
+ AWK = kern.AWK
+ POK = kern.POK
+ aux = kern.aux
+
+ FW = kern.PROP.fw
+ BW = kern.PROP.bw
+
+ # get addresses and auxiliary array
+ addr = prep.addr_gpu
+ fic = prep.fic_gpu
+
+ err_phot = prep.err_phot_gpu
+ # local references
+ ob = self.engine.ob.S[oID].data
+ obg = ob_grad.S[oID].data
+ pr = self.engine.pr.S[pID].data
+ prg = pr_grad.S[pID].data
+
+ # Schedule w & I to device
+ ev_w, w, data_w = self.engine.w_data.to_gpu(
+ prep.weights, dID, qu_htod)
+ ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod)
+
+ # make propagated exit (to buffer)
+ AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
+
+ # forward prop
+ FW(aux, aux)
+ GDK.make_model(aux, addr)
+
+ queue.wait_event(ev)
+
+ if self.p.floating_intensities:
+ GDK.floating_intensity(addr, w, I, fic)
+
+ GDK.main(aux, addr, w, I)
+ data_w.record_done(queue, 'compute')
+ data_I.record_done(queue, 'compute')
+
+ GDK.error_reduce(addr, err_phot)
+
+ BW(aux, aux)
+
+ use_atomics = self.p.object_update_cuda_atomics
+ addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+ POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
+
+ use_atomics = self.p.probe_update_cuda_atomics
+ addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+ POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
+
+ queue.synchronize()
+ self.engine.dID_list.reverse()
+
+ # TODO: we need err_phot.sum here, but not necessarily this error_dct until the end of a contiguous iteration
+ for dID, prep in self.engine.diff_info.items():
+ err_phot = prep.err_phot_gpu.get()
+ LL += err_phot.sum()
+ err_phot /= np.prod(prep.weights.shape[-2:])
+ err_fourier = np.zeros_like(err_phot)
+ err_exit = np.zeros_like(err_phot)
+ errs = np.ascontiguousarray(
+ np.vstack([err_fourier, err_phot, err_exit]).T)
+ error_dct.update(zip(prep.view_IDs, errs))
+
+ # MPI reduction of gradients
+
+ # DtoH copies
+ for s in ob_grad.S.values():
+ s.gpu.get(out=s.cpu)
+ for s in pr_grad.S.values():
+ s.gpu.get(out=s.cpu)
+
self.engine._set_pr_ob_ref_for_data('cpu') + + ob_grad.allreduce() + pr_grad.allreduce() + parallel.allreduce(LL) + + # HtoD cause we continue on gpu + for s in ob_grad.S.values(): + s.gpu.set(s.cpu) + for s in pr_grad.S.values(): + s.gpu.set(s.cpu) + self.engine._set_pr_ob_ref_for_data('gpu') + + # Object regularizer + if self.regularizer: + for name, s in self.engine.ob.storages.items(): + ob_grad.storages[name].data += self.regularizer.grad(s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, c_ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + # does not accept np.longdouble + B = cp.zeros((3,), dtype=np.float32) + Brenorm = 1. / self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + ob_h = c_ob_h.S[oID].data + pr = self.pr.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False) + + # forward prop + FW(f, f) + FW(a, a) + + queue.wait_event(ev) + + GDK.make_a012_notb(f, a, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + c_ob_h.storages[name].data, s.data) + + self.B = B + + return B + + def poly_line_coeffs_pr(self, c_pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + # does not accept np.longdouble + B = cp.zeros((3,), dtype=np.float32) + Brenorm = 1. 
/ self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + pr = self.pr.S[pID].data + pr_h = c_pr_h.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob, pr_h, add=True) + + # forward prop + FW(f, f) + FW(a, a) + + queue.wait_event(ev) + + GDK.make_a012_notb(f, a, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + self.B = B + + return B + + +class Regul_del2_cupy(object): + """\ + Squared gradient regularizer (Gaussian prior). + + This class applies to any numpy array. + """ + + def __init__(self, amplitude, axes=[-2, -1], queue=None): + # Regul.__init__(self, axes) + self.axes = axes + self.amplitude = amplitude + self.delxy = None + self.g = None + self.LL = None + self.queue = queue + self.AUK = ArrayUtilsKernel(queue=queue) + self.DELK_c = DerivativesKernel(np.complex64, queue=queue) + self.DELK_f = DerivativesKernel(np.float32, queue=queue) + + + def empty(x): return cp.empty( + x.shape, x.dtype) + + def delxb(x, axis=-1): + out = empty(x) + if x.dtype == np.float32: + self.DELK_f.delxb(x, out, axis) + elif x.dtype == np.complex64: + self.DELK_c.delxb(x, out, axis) + else: + raise TypeError("Type %s invalid for derivatives" % x.dtype) + return out + + self.delxb = delxb + + def delxf(x, axis=-1): + out = empty(x) + if x.dtype == np.float32: + self.DELK_f.delxf(x, out, axis) + elif x.dtype == np.complex64: + self.DELK_c.delxf(x, out, axis) + else: + raise TypeError("Type %s invalid for derivatives" % x.dtype) + return out + + self.delxf = delxf + self.norm = lambda x: self.AUK.norm2(x).get().item() + self.dot = lambda x, y: self.AUK.dot(x, y).get().item() + + self._grad_reg_kernel = cp.ElementwiseKernel( + "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx", + "complex64 out", + "out = (px+py-my-mx) * fac", + "grad_reg", + no_return=True + ) + + def grad(amp, px, py, mx, my): + out = empty(px) + if self.queue is not None: + self.queue.use() + self._grad_reg_kernel(amp, py, px, mx, my, out) + return out + self.reg_grad = grad + + def grad(self, x): + """ + Compute and return the regularizer gradient given the array x. + """ + ax0, ax1 = self.axes + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + self.delxy = [del_xf, del_yf, del_xb, del_yb] + + # TODO this one might be slow, maybe try with elementwise kernel + #self.g = (del_xb + del_yb - del_xf - del_yf) * 2. * self.amplitude + self.g = self.reg_grad(2. 
* self.amplitude, + del_xb, del_yb, del_xf, del_yf) + + self.LL = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + return self.g + + def poly_line_coeffs(self, h, x=None): + ax0, ax1 = self.axes + if x is None: + del_xf, del_yf, del_xb, del_yb = self.delxy + else: + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + hdel_xf = self.delxf(h, axis=ax0) + hdel_yf = self.delxf(h, axis=ax1) + hdel_xb = self.delxb(h, axis=ax0) + hdel_yb = self.delxb(h, axis=ax1) + + c0 = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + c1 = 2 * self.amplitude * (self.dot(del_xf, hdel_xf) + + self.dot(del_yf, hdel_yf) + + self.dot(del_xb, hdel_xb) + + self.dot(del_yb, hdel_yb)) + + c2 = self.amplitude * (self.norm(hdel_xf) + + self.norm(hdel_yf) + + self.norm(hdel_xb) + + self.norm(hdel_yb)) + + self.coeff = np.array([c0, c1, c2]) + return self.coeff diff --git a/ptypy/custom/MLSeparateGrads_pycuda.py b/ptypy/custom/MLSeparateGrads_pycuda.py new file mode 100644 index 000000000..501c95864 --- /dev/null +++ b/ptypy/custom/MLSeparateGrads_pycuda.py @@ -0,0 +1,858 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import numpy as np +from pycuda import gpuarray +import pycuda.driver as cuda + +from ptypy.engines import register +from .MLSeparateGrads_serial import MLSeparateGrads_serial +from ptypy.accelerate.base.engines.ML_serial import BaseModelSerial +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.accelerate.cuda_pycuda import get_context, get_dev_pool +from ptypy.accelerate.cuda_pycuda.kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ptypy.accelerate.cuda_pycuda.array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, FFTGaussianSmoothingKernel, TransposeKernel + +from ptypy.accelerate.cuda_pycuda.mem_utils import GpuDataManager +from ptypy.accelerate.base import address_manglers + +__all__ = ['MLSeparateGrads_pycuda'] + +MAX_BLOCKS = 99999 # can be used to limit the number of blocks, simulating that they don't fit +#MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit + +@register() +class MLSeparateGrads_pycuda(MLSeparateGrads_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [use_cuda_device_memory_pool] + default = True + type = bool + help = For GPU, use a device memory pool + + [fft_lib] + default = reikna + type = str + help = Choose the pycuda-compatible FFT module. 
+ doc = One of:
+ - ``'reikna'`` : the reikna package (fast load, competitive compute for streaming)
+ - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+ - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages)
+ choices = 'reikna','cuda','skcuda'
+ userlevel = 2
+
+ """
+
+ def __init__(self, ptycho_parent, pars=None):
+ """
+ Maximum likelihood reconstruction engine.
+ """
+ super().__init__(ptycho_parent, pars)
+
+ def engine_initialize(self):
+ """
+ Prepare for ML reconstruction.
+ """
+ self.context, self.queue = get_context(new_queue=True)
+
+ if self.p.use_cuda_device_memory_pool:
+ self._dev_pool = get_dev_pool()
+ self.allocate = self._dev_pool.allocate
+ else:
+ self._dev_pool = None
+ self.allocate = cuda.mem_alloc
+
+ self.qu_htod = cuda.Stream()
+ self.qu_dtoh = cuda.Stream()
+
+ if self.p.smooth_gradient_method == "convolution":
+ self.GSK = GaussianSmoothingKernel(queue=self.queue)
+ self.GSK.tmp = None
+
+ if self.p.smooth_gradient_method == "fft":
+ self.FGSK = FFTGaussianSmoothingKernel(queue=self.queue)
+
+ # Real/Fourier Support Kernel
+ self.RSK = {}
+ self.FSK = {}
+
+ super().engine_initialize()
+ #self._setup_kernels()
+
+ def _setup_kernels(self):
+ """
+ Setup kernels, one for each scan. Derive scans from ptycho class
+ """
+ AUK = ArrayUtilsKernel(queue=self.queue)
+ self._dot_kernel = AUK.dot
+ # get the scans
+ for label, scan in self.ptycho.model.scans.items():
+
+ kern = u.Param()
+ kern.scanmodel = type(scan).__name__
+ self.kernels[label] = kern
+
+ # TODO: needs to be adapted for broad bandwidth
+ geo = scan.geometries[0]
+
+ # Get info to shape buffer arrays
+ fpc = scan.max_frames_per_block
+
+ # TODO : make this more foolproof
+ try:
+ nmodes = scan.p.coherence.num_probe_modes * \
+ scan.p.coherence.num_object_modes
+ except:
+ nmodes = 1
+
+ # create buffer arrays
+ ash = (fpc * nmodes,) + tuple([int(s) for s in geo.shape])
+ aux = gpuarray.zeros(ash, dtype=np.complex64)
+ kern.aux = aux
+ kern.a = gpuarray.zeros(ash, dtype=np.complex64)
+ kern.b = gpuarray.zeros(ash, dtype=np.complex64)
+
+ # setup kernels, one for each SCAN.
+ kern.GDK = GradientDescentKernel(aux, nmodes, queue=self.queue, math_type="double")
+ kern.GDK.allocate()
+
+ kern.POK = PoUpdateKernel(queue_thread=self.queue)
+ kern.POK.allocate()
+
+ kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
+ kern.AWK.allocate()
+
+ kern.TK = TransposeKernel(queue=self.queue)
+
+ kern.PROP = PropagationKernel(aux, geo.propagator, queue_thread=self.queue, fft=self.p.fft_lib)
+ kern.PROP.allocate()
+ kern.resolution = geo.resolution[0]
+
+ if self.do_position_refinement:
+ kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+ kern.PCK.allocate()
+
+ mag_mem = 0
+ for scan, kern in self.kernels.items():
+ mag_mem = max(kern.aux.nbytes // 2, mag_mem)
+ ma_mem = mag_mem
+ mem = cuda.mem_get_info()[0]
+ blk = ma_mem + mag_mem
+ fit = int(mem - 200 * 1024 * 1024) // blk # leave 200MB room for safety
+ if not fit:
+ log(1,"Cannot fit memory into device, if possible reduce frames per block.
Exiting...") + raise SystemExit("ptypy has been exited.") + + # TODO grow blocks dynamically + nma = min(fit, MAX_BLOCKS) + log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3))) + log(4, 'Memory to be allocated per block {:.2f} GB'.format(float(blk)/(1024**3))) + log(4, 'PyCUDA max blocks fitting on GPU: ma_arrays={}'.format(nma)) + # reset memory or create new + self.w_data = GpuDataManager(ma_mem, 0, nma, False) + self.I_data = GpuDataManager(mag_mem, 0, nma, False) + + def engine_prepare(self): + + super().engine_prepare() + ## Serialize new data ## + use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics) + + # recursive copy to gpu for probe and object + for _cname, c in self.ptycho.containers.items(): + if c.original != self.pr and c.original != self.ob: + continue + for _sname, s in c.S.items(): + # convert data here + s.gpu = gpuarray.to_gpu(s.data) + s.cpu = cuda.pagelocked_empty(s.data.shape, s.data.dtype, order="C") + s.cpu[:] = s.data + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot) + prep.fic_gpu = gpuarray.ones_like(prep.err_phot_gpu) + + if use_tiles: + prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1))) + + prep.addr_gpu = gpuarray.to_gpu(prep.addr) + if self.do_position_refinement: + prep.original_addr_gpu = gpuarray.to_gpu(prep.original_addr) + prep.error_state_gpu = gpuarray.empty_like(prep.err_phot_gpu) + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + # Todo: Which address to pick? + if use_tiles: + prep.addr2_gpu = gpuarray.to_gpu(prep.addr2) + + prep.I = cuda.pagelocked_empty(d.data.shape, d.data.dtype, order="C", mem_flags=4) + prep.I[:] = d.data + + # Todo: avoid that extra copy of data + if self.do_position_refinement: + ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4) + prep.ma[:] = ma + + log(4, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9)) + self.w_data.add_data_block() + self.I_data.add_data_block() + + self.dID_list = list(self.di.S.keys()) + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): + """ + Overloading the context of Storage.data here, to allow for in-place math on Container instances: + """ + if container is not None: + if container.original==self.pr or container.original==self.ob: + for s in container.S.values(): + # convert data here + if dev == 'gpu': + s.data = s.gpu + if sync_copy: s.gpu.set(s.cpu) + elif dev == 'cpu': + s.data = s.cpu + if sync_copy: + s.gpu.get(s.cpu) + #print('%s to cpu' % s.ID) + else: + for container in self.ptycho.containers.values(): + self._set_pr_ob_ref_for_data(dev=dev, container=container, sync_copy=sync_copy) + + def _get_smooth_gradient(self, data, sigma): + if self.p.smooth_gradient_method == "convolution": + if self.GSK.tmp is None: + self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64) + try: + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + except MemoryError: + raise 
RuntimeError("Convolution kernel too large for direct convolution on GPU", + "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.") + elif self.p.smooth_gradient_method == "fft": + self.FGSK.filter(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") + return data + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.gpu = self._get_smooth_gradient(s.gpu, self.smooth_gradient.sigma) + + return self._replace_grad(self.ob_grad, new_ob_grad) + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + + return self._replace_grad(self.pr_grad , new_pr_grad) + + def _replace_grad(self, grad, new_grad): + norm = np.double(0.) + dot = np.double(0.) + for name, new in new_grad.storages.items(): + old = grad.storages[name] + norm += self._dot_kernel(new.gpu,new.gpu).get()[0] + dot += self._dot_kernel(new.gpu,old.gpu).get()[0] + old.gpu[:] = new.gpu + return norm, dot + + def engine_iterate(self, num=1): + err = super().engine_iterate(num) + # copy all data back to cpu + self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True) + return err + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. 
+ """ + log(4, "----------- START POS REF -------------") + for dID in self.dID_list: + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + err_phot = prep.err_phot_gpu + error_state = prep.error_state_gpu + + # copy intensities and weights to GPU + ev_w, w, data_w = self.w_data.to_gpu(prep.weights, dID, self.qu_htod) + ev, I, data_I = self.I_data.to_gpu(prep.I, dID, self.qu_htod) + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + PCK.queue.wait_for_event(ev) + # w & I now on device + PCK.log_likelihood_ml(aux, addr, I, w, err_phot) + cuda.memcpy_dtod(dest=error_state.ptr, + src=err_phot.ptr, + size=err_phot.nbytes) + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot) + PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_phot) + + data_w.record_done(self.queue, 'compute') + data_I.record_done(self.queue, 'compute') + cuda.memcpy_dtod(dest=err_phot.ptr, + src=error_state.ptr, + size=err_phot.nbytes) + if use_tiles: + s1 = addr.shape[0] * addr.shape[1] + s2 = addr.shape[2] * addr.shape[3] + TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1)) + + self.dID_list.reverse() + + def support_constraint(self, storage=None): + """ + Enforces 2D support constraint on probe. + """ + if storage is None: + for s in self.pr.storages.values(): + self.support_constraint(s) + + # Fourier space + support = self._probe_fourier_support.get(storage.ID) + if support is not None: + if storage.ID not in self.FSK: + supp = support.astype(np.complex64) + self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib) + self.FSK[storage.ID].allocate() + self.FSK[storage.ID].apply_fourier_support(storage.gpu) + + # Real space + support = self._probe_support.get(storage.ID) + if support is not None: + if storage.ID not in self.RSK: + self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64)) + self.RSK[storage.ID].allocate() + self.RSK[storage.ID].apply_real_support(storage.gpu) + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.w_data = None + self.I_data = None + + for name, s in self.pr.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for name, s in self.ob.S.items(): + s.data = s.gpu.get() # need this, otherwise getting segfault once context is detached + # no longer need those + del s.gpu + del s.cpu + for dID, prep in self.diff_info.items(): + prep.addr = prep.addr_gpu.get() + prep.float_intens_coeff = prep.fic_gpu.get() + + + #self.queue.synchronize() + super().engine_finalize() + +class GaussianModel(BaseModelSerial): + """ + Gaussian noise model. + TODO: feed actual statistical weights instead of using the Poisson statistic heuristic. 
+ """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Gaussian model. + """ + super(GaussianModel, self).__init__(MLengine) + + if self.p.reg_del2: + self.regularizer = Regul_del2_pycuda( + self.p.reg_del2_amplitude, + queue=self.engine.queue, + allocator=self.engine.allocate + ) + else: + self.regularizer = None + + def prepare(self): + + super(GaussianModel, self).prepare() + + for label, d in self.engine.ptycho.new_data: + prep = self.engine.diff_info[d.ID] + w = (self.Irenorm * self.engine.ma.S[d.ID].data + / (1. / self.Irenorm + d.data)).astype(d.data.dtype) + prep.weights = cuda.pagelocked_empty(w.shape, w.dtype, order="C", mem_flags=4) + prep.weights[:] = w + + def __del__(self): + """ + Clean up routine + """ + super(GaussianModel, self).__del__() + + def new_grad(self): + """ + Compute a new gradient direction according to a Gaussian noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + ob_grad = self.engine.ob_grad_new + pr_grad = self.engine.pr_grad_new + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + self.engine._set_pr_ob_ref_for_data('gpu') + ob_grad << 0. + pr_grad << 0. + + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + POK = kern.POK + aux = kern.aux + + FW = kern.PROP.fw + BW = kern.PROP.bw + + # get addresses and auxilliary array + addr = prep.addr_gpu + fic = prep.fic_gpu + + err_phot = prep.err_phot_gpu + # local references + ob = self.engine.ob.S[oID].data + obg = ob_grad.S[oID].data + pr = self.engine.pr.S[pID].data + prg = pr_grad.S[pID].data + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + + # forward prop + FW(aux, aux) + GDK.make_model(aux, addr) + + queue.wait_for_event(ev) + + if self.p.floating_intensities: + GDK.floating_intensity(addr, w, I, fic) + + GDK.main(aux, addr, w, I) + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + GDK.error_reduce(addr, err_phot) + + BW(aux, aux) + + use_atomics = self.p.object_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics) + + use_atomics = self.p.probe_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics) + + queue.synchronize() + self.engine.dID_list.reverse() + + # TODO we err_phot.sum, but not necessarily this error_dct until the end of contiguous iteration + for dID, prep in self.engine.diff_info.items(): + err_phot = prep.err_phot_gpu.get() + LL += err_phot.sum() + err_phot /= np.prod(prep.weights.shape[-2:]) + err_fourier = np.zeros_like(err_phot) + err_exit = np.zeros_like(err_phot) + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + error_dct.update(zip(prep.view_IDs, errs)) + + # MPI reduction of gradients + + # DtoH copies + for s in ob_grad.S.values(): + s.gpu.get(s.cpu) + for s in pr_grad.S.values(): + s.gpu.get(s.cpu) + self.engine._set_pr_ob_ref_for_data('cpu') + + ob_grad.allreduce() + 
pr_grad.allreduce() + parallel.allreduce(LL) + + # HtoD cause we continue on gpu + for s in ob_grad.S.values(): + s.gpu.set(s.cpu) + for s in pr_grad.S.values(): + s.gpu.set(s.cpu) + self.engine._set_pr_ob_ref_for_data('gpu') + + # Object regularizer + if self.regularizer: + for name, s in self.engine.ob.storages.items(): + ob_grad.storages[name].data += self.regularizer.grad(s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs_ob(self, c_ob_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the object + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + B = gpuarray.zeros((3,), dtype=np.float32) # does not accept np.longdouble + Brenorm = 1. / self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + ob_h = c_ob_h.S[oID].data + pr = self.pr.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False) + + # forward prop + FW(f,f) + FW(a,a) + + queue.wait_for_event(ev) + + GDK.make_a012_notb(f, a, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + c_ob_h.storages[name].data, s.data) + + self.B = B + + return B + + def poly_line_coeffs_pr(self, c_pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h for the probe + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + B = gpuarray.zeros((3,), dtype=np.float32) # does not accept np.longdouble + Brenorm = 1. 
/ self.LL[0] ** 2
+
+        # Outer loop: through diffraction patterns
+        for dID in self.engine.dID_list:
+            prep = self.engine.diff_info[dID]
+
+            # find probe, object and exit IDs for this dID
+            pID, oID, eID = prep.poe_IDs
+
+            # references for kernels
+            kern = self.engine.kernels[prep.label]
+            GDK = kern.GDK
+            AWK = kern.AWK
+
+            f = kern.aux
+            a = kern.a
+
+            FW = kern.PROP.fw
+
+            # get addresses and auxiliary arrays
+            addr = prep.addr_gpu
+            fic = prep.fic_gpu
+
+            # Schedule w & I to device
+            ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod)
+            ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod)
+
+            # local references
+            ob = self.ob.S[oID].data
+            pr = self.pr.S[pID].data
+            pr_h = c_pr_h.S[pID].data
+
+            # make propagated exit (to buffer); `a` must be overwritten,
+            # not accumulated, since only the probe direction varies here
+            AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
+            AWK.build_aux_no_ex(a, addr, ob, pr_h, add=False)
+
+            # forward prop
+            FW(f, f)
+            FW(a, a)
+
+            queue.wait_for_event(ev)
+
+            GDK.make_a012_notb(f, a, addr, I, fic)
+            GDK.fill_b(addr, Brenorm, w, B)
+
+            data_w.record_done(queue, 'compute')
+            data_I.record_done(queue, 'compute')
+
+        queue.synchronize()
+        self.engine.dID_list.reverse()
+
+        B = B.get()
+        parallel.allreduce(B)
+
+        self.B = B
+
+        return B
+
+
+class Regul_del2_pycuda(object):
+    """\
+    Squared gradient regularizer (Gaussian prior).
+
+    This class applies to any numpy array.
+    """
+    def __init__(self, amplitude, axes=[-2, -1], queue=None, allocator=None):
+        self.axes = axes
+        self.amplitude = amplitude
+        self.delxy = None
+        self.g = None
+        self.LL = None
+        self.queue = queue
+        self.AUK = ArrayUtilsKernel(queue=queue)
+        self.DELK_c = DerivativesKernel(np.complex64, queue=queue)
+        self.DELK_f = DerivativesKernel(np.float32, queue=queue)
+
+        if allocator is None:
+            self._dev_pool = get_dev_pool()
+            self.allocator = self._dev_pool.allocate
+        else:
+            self.allocator = allocator
+            self._dev_pool = None
+
+        empty = lambda x: gpuarray.empty(x.shape, x.dtype, allocator=self.allocator)
+
+        def delxb(x, axis=-1):
+            out = empty(x)
+            if x.dtype == np.float32:
+                self.DELK_f.delxb(x, out, axis)
+            elif x.dtype == np.complex64:
+                self.DELK_c.delxb(x, out, axis)
+            else:
+                raise TypeError("Type %s invalid for derivatives" % x.dtype)
+            return out
+
+        self.delxb = delxb
+
+        def delxf(x, axis=-1):
+            out = empty(x)
+            if x.dtype == np.float32:
+                self.DELK_f.delxf(x, out, axis)
+            elif x.dtype == np.complex64:
+                self.DELK_c.delxf(x, out, axis)
+            else:
+                raise TypeError("Type %s invalid for derivatives" % x.dtype)
+            return out
+
+        self.delxf = delxf
+        self.norm = lambda x: self.AUK.norm2(x).get().item()
+        self.dot = lambda x, y: self.AUK.dot(x, y).get().item()
+
+        from pycuda.elementwise import ElementwiseKernel
+        self._grad_reg_kernel = ElementwiseKernel(
+            "pycuda::complex<float> *g, float fac, \
+             pycuda::complex<float> *py, pycuda::complex<float> *px, \
+             pycuda::complex<float> *my, pycuda::complex<float> *mx",
+            "g[i] = (px[i]+py[i]-my[i]-mx[i]) * fac",
+            "grad_reg",
+        )
+
+        def grad(amp, px, py, mx, my):
+            out = empty(px)
+            self._grad_reg_kernel(out, amp, py, px, mx, my, stream=self.queue)
+            return out
+        self.reg_grad = grad
+
+    def grad(self, x):
+        """
+        Compute and return the regularizer gradient given the array x.
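+
+        For the prior LL(x) = amplitude * ( |del_xf x|^2 + |del_yf x|^2
+        + |del_xb x|^2 + |del_yb x|^2 ), built from forward (f) and
+        backward (b) finite differences along both axes, the gradient with
+        respect to x reduces (up to boundary terms) to
+
+            g = 2 * amplitude * (del_xb x + del_yb x - del_xf x - del_yf x),
+
+        i.e. a scaled discrete Laplacian, which the fused elementwise
+        kernel evaluates in a single pass over the array.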
+ """ + ax0, ax1 = self.axes + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + self.delxy = [del_xf, del_yf, del_xb, del_yb] + + # TODO this one might be slow, maybe try with elementwise kernel + #self.g = (del_xb + del_yb - del_xf - del_yf) * 2. * self.amplitude + self.g = self.reg_grad(2. * self.amplitude, del_xb, del_yb, del_xf, del_yf) + + + self.LL = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + return self.g + + def poly_line_coeffs(self, h, x=None): + ax0, ax1 = self.axes + if x is None: + del_xf, del_yf, del_xb, del_yb = self.delxy + else: + del_xf = self.delxf(x, axis=ax0) + del_yf = self.delxf(x, axis=ax1) + del_xb = self.delxb(x, axis=ax0) + del_yb = self.delxb(x, axis=ax1) + + hdel_xf = self.delxf(h, axis=ax0) + hdel_yf = self.delxf(h, axis=ax1) + hdel_xb = self.delxb(h, axis=ax0) + hdel_yb = self.delxb(h, axis=ax1) + + c0 = self.amplitude * (self.norm(del_xf) + + self.norm(del_yf) + + self.norm(del_xb) + + self.norm(del_yb)) + + c1 = 2 * self.amplitude * (self.dot(del_xf, hdel_xf) + + self.dot(del_yf, hdel_yf) + + self.dot(del_xb, hdel_xb) + + self.dot(del_yb, hdel_yb)) + + c2 = self.amplitude * (self.norm(hdel_xf) + + self.norm(hdel_yf) + + self.norm(hdel_xb) + + self.norm(hdel_yb)) + + self.coeff = np.array([c0, c1, c2]) + return self.coeff