diff --git a/ptypy/custom/LBFGS.py b/ptypy/custom/LBFGS.py
new file mode 100644
index 000000000..673f58600
--- /dev/null
+++ b/ptypy/custom/LBFGS.py
@@ -0,0 +1,383 @@
+# -*- coding: utf-8 -*-
+"""
+Limited-memory BFGS reconstruction engine.
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: GPLv2, see LICENSE for details.
+"""
+import numpy as np
+import time
+
+#from ptypy import utils as u
+from ptypy.utils.verbose import logger
+#from ptypy.utils import parallel
+from ptypy.engines.utils import Cnorm2, Cdot
+from ptypy.engines import register
+from ptypy.engines.ML import ML
+from ptypy.core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull
+
+
+__all__ = ['LBFGS']
+
+
+@register()
+class LBFGS(ML):
+    """
+    Limited-memory BFGS reconstruction engine.
+
+
+    Defaults:
+
+    [name]
+    default = LBFGS
+    type = str
+    help =
+    doc =
+
+    [ML_type]
+    default = 'gaussian'
+    type = str
+    help = Likelihood model
+    choices = ['gaussian','poisson','euclid']
+    doc = One of ‘gaussian’, poisson’ or ‘euclid’. Only 'gaussian' is implemented.
+
+    [floating_intensities]
+    default = False
+    type = bool
+    help = Adaptive diffraction pattern rescaling
+    doc = If True, allow for adaptative rescaling of the diffraction pattern intensities (to correct for incident beam intensity fluctuations).
+
+    [intensity_renormalization]
+    default = 1.
+    type = float
+    lowlim = 0.0
+    help = Rescales the intensities so they can be interpreted as Poisson counts.
+
+    [reg_del2]
+    default = False
+    type = bool
+    help = Whether to use a Gaussian prior (smoothing) regularizer
+
+    [reg_del2_amplitude]
+    default = .01
+    type = float
+    lowlim = 0.0
+    help = Amplitude of the Gaussian prior if used
+
+    [smooth_gradient]
+    default = 0.0
+    type = float
+    help = Smoothing preconditioner
+    doc = Sigma for gaussian filter (turned off if 0.)
+
+    [smooth_gradient_decay]
+    default = 0.
+    type = float
+    help = Decay rate for smoothing preconditioner
+    doc = Sigma for gaussian filter will reduce exponentially at this rate
+
+    [scale_precond]
+    default = False
+    type = bool
+    help = Whether to use the object/probe scaling preconditioner
+    doc = This parameter can give faster convergence for weakly scattering samples.
+
+    [scale_probe_object]
+    default = 1.
+    type = float
+    lowlim = 0.0
+    help = Relative scale of probe to object
+
+    [probe_update_start]
+    default = 2
+    type = int
+    lowlim = 0
+    help = Number of iterations before probe update starts
+
+    [wavefield_precond]
+    default = False
+    type = bool
+    help = Whether to use the wavefield preconditioner
+    doc = This parameter can give faster convergence.
+
+    [wavefield_delta_object]
+    default = 0.1
+    type = float
+    help = Wavefield preconditioner damping constant for the object.
+
+    [wavefield_delta_probe]
+    default = 0.1
+    type = float
+    help = Wavefield preconditioner damping constant for the probe.
+
+    [bfgs_memory_size]
+    default = 5
+    type = int
+    lowlim = 2
+    help = Number of BFGS updates to store
+    """
+
+    SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull]
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Limited-memory BFGS reconstruction engine.
+        """
+        super().__init__(ptycho_parent, pars)
+
+        # Memory of object updates and gradient differences
+        self.ob_s = [None]*self.p.bfgs_memory_size
+        self.ob_y = [None]*self.p.bfgs_memory_size
+
+        # Memory of probe updates and gradient differences
+        self.pr_s = [None]*self.p.bfgs_memory_size
+        self.pr_y = [None]*self.p.bfgs_memory_size
+
+        # Other BFGS memories
+        self.rho = np.zeros(self.p.bfgs_memory_size)
+        self.alpha = np.zeros(self.p.bfgs_memory_size)
+
+        self.ptycho.citations.add_article(
+            title='L-BFGS for Ptychography paper',
+            author='Fowkes J. and Daurer B.',
+            journal='TBA',
+            volume=None,
+            year=None,
+            page=None,
+            doi='TBA',
+            comment='The L-BFGS reconstruction algorithm',
+        )
+
+    def engine_initialize(self):
+        """
+        Prepare for reconstruction.
+        """
+        super().engine_initialize()
+
+        # Create containers for memories of updates and gradient differences
+        for i in range(self.p.bfgs_memory_size):
+            self.ob_s[i] = self.ob.copy(self.ob.ID + '_s' + str(i), fill=0.)
+            self.ob_y[i] = self.ob.copy(self.ob.ID + '_y' + str(i), fill=0.)
+            self.pr_s[i] = self.pr.copy(self.pr.ID + '_s' + str(i), fill=0.)
+            self.pr_y[i] = self.pr.copy(self.pr.ID + '_y' + str(i), fill=0.)
+
+
+
+    def engine_prepare(self):
+        """
+        Last minute initialization, everything, that needs to be recalculated,
+        when new data arrives.
+        """
+        super().engine_prepare()
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations.
+        """
+        tg = 0.
+        tc = 0.
+        ta = time.time()
+        for it in range(num):
+            #######################################
+            # Compute new gradient (same as ML)
+            #######################################
+            t1 = time.time()
+            error_dct = self.ML_model.new_grad()
+            new_ob_grad, new_pr_grad = self.ob_grad_new, self.pr_grad_new
+            tg += time.time() - t1
+
+            if self.p.probe_update_start <= self.curiter:
+                # Apply probe support if needed
+                for name, s in new_pr_grad.storages.items():
+                    self.support_constraint(s)
+                    #support = self.probe_support.get(name)
+                    #if support is not None:
+                    #    s.data *= support
+            else:
+                new_pr_grad.fill(0.)
+
+            # probe/object rescaling (not used for now)
+            if self.p.scale_precond:
+                cn2_new_pr_grad = Cnorm2(new_pr_grad)
+                cn2_new_ob_grad = Cnorm2(new_ob_grad)
+                if cn2_new_pr_grad > 1e-5:
+                    scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad
+                                 / cn2_new_pr_grad)
+                else:
+                    scale_p_o = self.p.scale_probe_object
+                if self.scale_p_o is None:
+                    self.scale_p_o = scale_p_o
+                else:
+                    self.scale_p_o = self.scale_p_o ** self.scale_p_o_memory
+                    self.scale_p_o *= scale_p_o ** (1-self.scale_p_o_memory)
+                logger.debug('Scale P/O: %6.3g' % scale_p_o)
+            else:
+                self.scale_p_o = self.p.scale_probe_object
+
+            ############################
+            # LBFGS Two Loop Recursion
+            ############################
+            if self.curiter == 0: # Initial steepest-descent step
+
+                # Wavefield preconditioner (applied twice)
+                if self.p.wavefield_precond:
+                    for name, s in new_ob_grad.storages.items():
+                        new_ob_grad.storages[name].data /= self.ob_fln.storages[name].data + self.p.wavefield_delta_object
+                        new_pr_grad.storages[name].data /= self.pr_fln.storages[name].data + self.p.wavefield_delta_probe
+
+                # Object steepest-descent step
+                self.ob_h -= new_ob_grad
+
+                # Probe steepest-descent step
+                new_pr_grad *= self.scale_p_o # probe preconditioning (applied twice)
+                self.pr_h -= new_pr_grad
+
+            else: # Two-loop LBFGS recursion
+
+                # Wavefield preconditioner
+                if self.p.wavefield_precond:
+                    for name, s in self.ob_h.storages.items():
+                        self.ob_h.storages[name].data *= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                        self.pr_h.storages[name].data *= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+                    for name, s in new_ob_grad.storages.items():
+                        new_ob_grad.storages[name].data /= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                        new_pr_grad.storages[name].data /= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+
+                # Memory index
+                mi = min(self.curiter,self.p.bfgs_memory_size)
+
+                # Remember last object update and gradient difference
+                self.ob_s[mi-1] << self.ob_h
+                self.ob_y[mi-1] << new_ob_grad
+                self.ob_y[mi-1] -= self.ob_grad
+
+                # Remember last probe update and gradient difference
+                self.pr_h /= np.sqrt(self.scale_p_o) # probe preconditioning
+                self.pr_s[mi-1] << self.pr_h
+                new_pr_grad *= np.sqrt(self.scale_p_o) # probe preconditioning
+                self.pr_y[mi-1] << new_pr_grad
+                self.pr_y[mi-1] -= self.pr_grad
+
+                # Compute and store rho
+                self.rho[mi-1] = 1. / ( np.real(Cdot(self.ob_y[mi-1], self.ob_s[mi-1]))
+                                        + np.real(Cdot(self.pr_y[mi-1], self.pr_s[mi-1])) )
+                # BFGS update
+                self.ob_h << new_ob_grad
+                self.pr_h << new_pr_grad
+                # Compute right-hand side of BGFS product
+                for i in reversed(range(mi)):
+                    self.alpha[i] = self.rho[i]*( np.real(Cdot(self.ob_s[i], self.ob_h))
+                                                  + np.real(Cdot(self.pr_s[i], self.pr_h)) )
+
+                    # self.ob_h -= self.alpha[i]*self.ob_y[i]
+                    self.ob_h /= self.alpha[i]
+                    self.ob_h -= self.ob_y[i]
+                    self.ob_h *= self.alpha[i]
+                    # self.pr_h -= self.alpha[i]*self.pr_y[i]
+                    self.pr_h /= self.alpha[i]
+                    self.pr_h -= self.pr_y[i]
+                    self.pr_h *= self.alpha[i]
+
+                # Compute centre of BFGS product (scaled identity)
+                c_num = ( np.real(Cdot(self.ob_s[mi-1], self.ob_y[mi-1]))
+                         + np.real(Cdot(self.pr_s[mi-1], self.pr_y[mi-1])) )
+                c_denom = Cnorm2(self.ob_y[mi-1]) + Cnorm2(self.pr_y[mi-1])
+                self.ob_h *= (c_num/c_denom)
+                self.pr_h *= (c_num/c_denom)
+
+
+                # Compute left-hand side of BFGS product
+                for i in range(mi):
+                    beta = self.rho[i]*( np.real(Cdot(self.ob_y[i], self.ob_h))
+                                         + np.real(Cdot(self.pr_y[i], self.pr_h)) )
+
+                    # self.ob_h += (self.alpha[i]-beta)*self.ob_s[i]
+                    self.ob_h /= (self.alpha[i]-beta)
+                    self.ob_h += self.ob_s[i]
+                    self.ob_h *= (self.alpha[i]-beta)
+                    # self.pr_h += (self.alpha[i]-beta)*self.pr_s[i]
+                    self.pr_h /= (self.alpha[i]-beta)
+                    self.pr_h += self.pr_s[i]
+                    self.pr_h *= (self.alpha[i]-beta)
+
+                # Wavefield preconditioner
+                if self.p.wavefield_precond:
+                    for name, s in self.ob_h.storages.items():
+                        self.ob_h.storages[name].data /= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                        self.pr_h.storages[name].data /= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+
+                # Flip step direction for minimisation
+                self.ob_h *= -1
+                self.pr_h *= np.sqrt(self.scale_p_o) # probe preconditioning
+                self.pr_h *= -1
+
+            # update current gradients with new gradients
+            self.ob_grad << new_ob_grad
+            self.pr_grad << new_pr_grad
+
+            # linesearch (same as ML)
+            t2 = time.time()
+            B = self.ML_model.poly_line_coeffs(self.ob_h, self.pr_h)
+            tc += time.time() - t2
+
+
+            if np.isinf(B).any() or np.isnan(B).any():
+                logger.warning(
+                    'Warning! inf or nan found! Trying to continue...')
+                B[np.isinf(B)] = 0.
+                B[np.isnan(B)] = 0.
+
+            dt = self.ptycho.FType
+            self.tmin = dt(-.5 * B[1] / B[2])
+
+            # step update
+            self.ob_h *= self.tmin
+            self.pr_h *= self.tmin
+            self.ob += self.ob_h
+            self.pr += self.pr_h
+
+            # Roll memory for overwriting
+            if self.curiter >= self.p.bfgs_memory_size:
+                self.ob_s.append(self.ob_s.pop(0))
+                self.pr_s.append(self.pr_s.pop(0))
+                self.ob_y.append(self.ob_y.pop(0))
+                self.pr_y.append(self.pr_y.pop(0))
+                self.rho = np.roll(self.rho,-1)
+
+            # Position correction
+            self.position_update()
+
+            # Allow for customized modifications at the end of each iteration
+            self._post_iterate_update()
+
+            # increase iteration counter
+            self.curiter +=1
+
+        logger.info('Time spent in gradient calculation: %.2f' % tg)
+        logger.info('  ....  in coefficient calculation: %.2f' % tc)
+        return error_dct  # np.array([[self.ML_model.LL[0]] * 3])
+
+    def _post_iterate_update(self):
+        """
+        Enables modification at the end of each LBFGS iteration.
+        """
+        pass
+
+    def engine_finalize(self):
+        """
+        Delete temporary containers.
+        """
+        super().engine_finalize()
+
+        # Delete containers for memories of updates and gradient differences
+        for i in reversed(range(self.p.bfgs_memory_size)):
+            del self.ptycho.containers[self.ob_s[i].ID]
+            del self.ob_s[i]
+            del self.ptycho.containers[self.ob_y[i].ID]
+            del self.ob_y[i]
+            del self.ptycho.containers[self.pr_s[i].ID]
+            del self.pr_s[i]
+            del self.ptycho.containers[self.pr_y[i].ID]
+            del self.pr_y[i]
diff --git a/ptypy/custom/LBFGS_cupy.py b/ptypy/custom/LBFGS_cupy.py
new file mode 100644
index 000000000..564a977b3
--- /dev/null
+++ b/ptypy/custom/LBFGS_cupy.py
@@ -0,0 +1,172 @@
+# -*- coding: utf-8 -*-
+"""
+Limited-memory BFGS reconstruction engine.
+
+TODO.
+
+  * Implement other regularizers
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: GPLv2, see LICENSE for details.
+"""
+import numpy as np
+import cupy as cp
+
+from ptypy.engines import register
+from ptypy.custom.LBFGS_serial import LBFGS_serial
+from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial
+from ptypy.accelerate.cuda_cupy.engines.ML_cupy import ML_cupy, GaussianModel
+from ptypy import utils as u
+from ptypy.utils.verbose import logger, log
+from ptypy.utils import parallel
+from ptypy.accelerate.cuda_cupy import get_context
+from ptypy.accelerate.cuda_cupy.kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
+from ptypy.accelerate.cuda_cupy.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
+from ptypy.accelerate.cuda_cupy.array_utils import ArrayUtilsKernel, DerivativesKernel, FFTGaussianSmoothingKernel, TransposeKernel
+from ptypy.accelerate.cuda_cupy.mem_utils import GpuDataManager
+from ptypy.accelerate.base import address_manglers
+
+__all__ = ['LBFGS_cupy']
+
+MAX_BLOCKS = 99999  # can be used to limit the number of blocks, simulating that they don't fit
+#MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+
+@register()
+class LBFGS_cupy(LBFGS_serial, ML_cupy):
+
+    """
+    Defaults:
+
+    [probe_update_cuda_atomics]
+    default = False
+    type = bool
+    help = For GPU, use the atomics version for probe update kernel
+
+    [object_update_cuda_atomics]
+    default = True
+    type = bool
+    help = For GPU, use the atomics version for object update kernel
+
+    [use_cuda_device_memory_pool]
+    default = True
+    type = bool
+    help = For GPU, use a device memory pool
+
+    [fft_lib]
+    default = reikna
+    type = str
+    help = Choose the pycuda-compatible FFT module.
+    doc = One of:
+      - ``'reikna'`` : the reikna packaga (fast load, competitive compute for streaming)
+      - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+      - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages)
+    choices = 'reikna','cuda','skcuda'
+    userlevel = 2
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Limited-memory BFGS reconstruction engine.
+        """
+        super().__init__(ptycho_parent, pars)
+
+    def engine_initialize(self):
+        """
+        Prepare for LBFGS reconstruction.
+        """
+        super().engine_initialize()
+
+    def engine_prepare(self):
+        super().engine_prepare()
+
+    def _initialize_model(self):
+
+        # Create noise model
+        if self.p.ML_type.lower() == "gaussian":
+            self.ML_model = GaussianModel(self)
+        elif self.p.ML_type.lower() == "poisson":
+            raise NotImplementedError('Poisson norm model not yet implemented')
+        elif self.p.ML_type.lower() == "euclid":
+            raise NotImplementedError('Euclid norm model not yet implemented')
+        else:
+            raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type)
+
+    def _get_smooth_gradient(self, data, sigma):
+        if self.p.smooth_gradient_method == "convolution":
+            if self.GSK.tmp is None:
+                self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64)
+            try:
+                self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp)
+            except MemoryError:
+                raise RuntimeError("Convolution kernel too large for direct convolution on GPU",
+                                   "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.")
+        elif self.p.smooth_gradient_method == "fft":
+            self.FGSK.filter(data, sigma)
+        else:
+            raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.")
+        return data
+
+    def _smooth_gradient(self):
+        new_ob_grad = self.ob_grad_new
+        # Smoothing preconditioner decay (once per iteration)
+        if self.smooth_gradient:
+            self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
+            for name, s in new_ob_grad.storages.items():
+                s.gpu = self._get_smooth_gradient(s.gpu, self.smooth_gradient.sigma)
+
+    def _get_ob_norm(self):
+        norm = self._get_norm(self.ob_grad_new)
+        return norm
+
+    def _get_pr_norm(self):
+        norm = self._get_norm(self.pr_grad_new)
+        return norm
+
+    def _get_dot(self, c1, c2):
+        dot = np.double(0.)
+        for name, s2 in c2.storages.items():
+            s1 = c1.storages[name]
+            dot += self._dot_kernel(s2.gpu, s1.gpu).get()[0]
+        return dot
+
+    def _get_norm(self, c):
+        norm = np.double(0.)
+        for name, s in c.storages.items():
+            norm += self._dot_kernel(s.gpu, s.gpu).get()[0]
+        return norm
+
+    def _replace_ob_pr_ys(self, mi):
+        self.cdotr_ob_ys[mi-1] = self._get_dot(self.ob_y[mi-1],
+                self.ob_s[mi-1])
+        self.cdotr_pr_ys[mi-1] = self._get_dot(self.pr_y[mi-1],
+                self.pr_s[mi-1])
+        self.cn2_ob_y[mi-1] = self._get_norm(self.ob_y[mi-1])
+        self.cn2_pr_y[mi-1] = self._get_norm(self.pr_y[mi-1])
+
+    def _get_ob_pr_sh(self, k):
+        ob_sh = self._get_dot(self.ob_s[k], self.ob_h)
+        pr_sh = self._get_dot(self.pr_s[k], self.pr_h)
+        return ob_sh, pr_sh
+
+    def _get_ob_pr_yh(self, k):
+        ob_yh = self._get_dot(self.ob_y[k], self.ob_h)
+        pr_yh = self._get_dot(self.pr_y[k], self.pr_h)
+        return ob_yh, pr_yh
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations.
+        """
+        error_dct = super().engine_iterate(num)
+        # copy all data back to cpu
+        self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True)
+        return error_dct
+
+    def engine_finalize(self):
+        """
+        Clear all GPU data, pinned memory, etc
+        """
+        super().engine_finalize()
+
diff --git a/ptypy/custom/LBFGS_pycuda.py b/ptypy/custom/LBFGS_pycuda.py
new file mode 100644
index 000000000..833e57ef6
--- /dev/null
+++ b/ptypy/custom/LBFGS_pycuda.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+"""
+Limited-memory BFGS reconstruction engine.
+
+TODO.
+
+  * Implement other regularizers
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: GPLv2, see LICENSE for details.
+"""
+import numpy as np
+from pycuda import gpuarray
+import pycuda.driver as cuda
+import pycuda.cumath
+from pycuda.tools import DeviceMemoryPool
+
+from ptypy.engines import register
+from ptypy.custom.LBFGS_serial import LBFGS_serial
+from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial
+from ptypy.accelerate.cuda_pycuda.engines.ML_pycuda import ML_pycuda, GaussianModel
+from ptypy import utils as u
+from ptypy.utils.verbose import logger, log
+from ptypy.utils import parallel
+from ptypy.accelerate.cuda_pycuda import get_context
+from ptypy.accelerate.cuda_pycuda.kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
+from ptypy.accelerate.cuda_pycuda.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
+from ptypy.accelerate.cuda_pycuda.array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel
+
+from ptypy.accelerate.cuda_pycuda.mem_utils import GpuDataManager
+from ptypy.accelerate.base import address_manglers
+
+__all__ = ['LBFGS_pycuda']
+
+MAX_BLOCKS = 99999  # can be used to limit the number of blocks, simulating that they don't fit
+#MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+
+@register()
+class LBFGS_pycuda(LBFGS_serial, ML_pycuda):
+
+    """
+    Defaults:
+
+    [probe_update_cuda_atomics]
+    default = False
+    type = bool
+    help = For GPU, use the atomics version for probe update kernel
+
+    [object_update_cuda_atomics]
+    default = True
+    type = bool
+    help = For GPU, use the atomics version for object update kernel
+
+    [use_cuda_device_memory_pool]
+    default = True
+    type = bool
+    help = For GPU, use a device memory pool
+
+    [fft_lib]
+    default = reikna
+    type = str
+    help = Choose the pycuda-compatible FFT module.
+    doc = One of:
+      - ``'reikna'`` : the reikna packaga (fast load, competitive compute for streaming)
+      - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+      - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages)
+    choices = 'reikna','cuda','skcuda'
+    userlevel = 2
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Limited-memory BFGS reconstruction engine.
+        """
+        super().__init__(ptycho_parent, pars)
+
+    def engine_initialize(self):
+        """
+        Prepare for LBFGS reconstruction.
+        """
+        super().engine_initialize()
+
+    def engine_prepare(self):
+        super().engine_prepare()
+
+    def _initialize_model(self):
+
+        # Create noise model
+        if self.p.ML_type.lower() == "gaussian":
+            self.ML_model = GaussianModel(self)
+        elif self.p.ML_type.lower() == "poisson":
+            raise NotImplementedError('Poisson norm model not yet implemented')
+        elif self.p.ML_type.lower() == "euclid":
+            raise NotImplementedError('Euclid norm model not yet implemented')
+        else:
+            raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type)
+
+    def _get_smooth_gradient(self, data, sigma):
+        if self.p.smooth_gradient_method == "convolution":
+            if self.GSK.tmp is None:
+                self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64)
+            try:
+                self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp)
+            except MemoryError:
+                raise RuntimeError("Convolution kernel too large for direct convolution on GPU",
+                                   "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.")
+        elif self.p.smooth_gradient_method == "fft":
+            self.FGSK.filter(data, sigma)
+        else:
+            raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.")
+        return data
+
+    def _smooth_gradient(self):
+        new_ob_grad = self.ob_grad_new
+        # Smoothing preconditioner decay (once per iteration)
+        if self.smooth_gradient:
+            self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
+            for name, s in new_ob_grad.storages.items():
+                s.gpu = self._get_smooth_gradient(s.gpu, self.smooth_gradient.sigma)
+
+    def _get_ob_norm(self):
+        norm = self._get_norm(self.ob_grad_new)
+        return norm
+
+    def _get_pr_norm(self):
+        norm = self._get_norm(self.pr_grad_new)
+        return norm
+
+    def _get_dot(self, c1, c2):
+        dot = np.double(0.)
+        for name, s2 in c2.storages.items():
+            s1 = c1.storages[name]
+            dot += self._dot_kernel(s2.gpu, s1.gpu).get()[0]
+        return dot
+
+    def _get_norm(self, c):
+        norm = np.double(0.)
+        for name, s in c.storages.items():
+            norm += self._dot_kernel(s.gpu, s.gpu).get()[0]
+        return norm
+
+    def _replace_ob_pr_ys(self, mi):
+        self.cdotr_ob_ys[mi-1] = self._get_dot(self.ob_y[mi-1],
+                self.ob_s[mi-1])
+        self.cdotr_pr_ys[mi-1] = self._get_dot(self.pr_y[mi-1],
+                self.pr_s[mi-1])
+        self.cn2_ob_y[mi-1] = self._get_norm(self.ob_y[mi-1])
+        self.cn2_pr_y[mi-1] = self._get_norm(self.pr_y[mi-1])
+
+    def _get_ob_pr_sh(self, k):
+        ob_sh = self._get_dot(self.ob_s[k], self.ob_h)
+        pr_sh = self._get_dot(self.pr_s[k], self.pr_h)
+        return ob_sh, pr_sh
+
+    def _get_ob_pr_yh(self, k):
+        ob_yh = self._get_dot(self.ob_y[k], self.ob_h)
+        pr_yh = self._get_dot(self.pr_y[k], self.pr_h)
+        return ob_yh, pr_yh
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations.
+        """
+        error_dct = super().engine_iterate(num)
+        # copy all data back to cpu
+        self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True)
+        return error_dct
+
+    def engine_finalize(self):
+        """
+        Clear all GPU data, pinned memory, etc
+        """
+        super().engine_finalize()
diff --git a/ptypy/custom/LBFGS_serial.py b/ptypy/custom/LBFGS_serial.py
new file mode 100644
index 000000000..27ac49bd7
--- /dev/null
+++ b/ptypy/custom/LBFGS_serial.py
@@ -0,0 +1,303 @@
+# -*- coding: utf-8 -*-
+"""
+Limited-memory BFGS reconstruction engine.
+
+TODO.
+
+  * Implement other regularizers
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: GPLv2, see LICENSE for details.
+"""
+import numpy as np
+import time
+
+from ptypy.custom.LBFGS import LBFGS
+from ptypy.engines.ML import ML, BaseModel
+# from .projectional_serial import serialize_array_access
+from ptypy.accelerate.base.engines.ML_serial import ML_serial, GaussianModel
+from ptypy.accelerate.base.engines.projectional_serial import serialize_array_access
+from ptypy import utils as u
+from ptypy.utils.verbose import logger, log
+from ptypy.utils import parallel
+from ptypy.engines.utils import Cnorm2, Cdot
+from ptypy.engines import register
+from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
+from ptypy.accelerate.base import address_manglers
+
+
+__all__ = ['LBFGS_serial']
+
+@register()
+class LBFGS_serial(LBFGS, ML_serial):
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Limited-memory BFGS reconstruction engine.
+        """
+        super(LBFGS_serial, self).__init__(ptycho_parent, pars)
+
+        self.cdotr_ob_ys = [0] * self.p.bfgs_memory_size
+        self.cdotr_pr_ys = [0] * self.p.bfgs_memory_size
+        self.cn2_ob_y = [0] * self.p.bfgs_memory_size
+        self.cn2_pr_y = [0] * self.p.bfgs_memory_size
+
+    def engine_initialize(self):
+        """
+        Prepare for reconstruction.
+        """
+        super(LBFGS_serial, self).engine_initialize()
+
+    def _initialize_model(self):
+        # Create noise model
+        if self.p.ML_type.lower() == "gaussian":
+            self.ML_model = GaussianModel(self)
+        elif self.p.ML_type.lower() == "poisson":
+            raise NotImplementedError('Poisson norm model not yet implemented')
+        elif self.p.ML_type.lower() == "euclid":
+            raise NotImplementedError('Euclid norm model not yet implemented')
+        else:
+            raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type)
+
+    def engine_prepare(self):
+        super(LBFGS_serial, self).engine_prepare()
+
+    def _get_smooth_gradient(self, data, sigma):
+        return self.smooth_gradient(data)
+
+    def _smooth_gradient(self):
+        new_ob_grad = self.ob_grad_new
+        # Smoothing preconditioner decay (once per iteration)
+        if self.smooth_gradient:
+            self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
+            for name, s in new_ob_grad.storages.items():
+                s.data[:] = self._get_smooth_gradient(s.data, self.smooth_gradient.sigma)
+
+    def _get_ob_norm(self):
+        norm = Cnorm2(self.ob_grad_new)
+        return norm
+
+    def _get_pr_norm(self):
+        norm = Cnorm2(self.pr_grad_new)
+        return norm
+
+    def _replace_ob_pr_ys(self, mi):
+        self.cdotr_ob_ys[mi-1] = np.real(Cdot(self.ob_y[mi-1],
+            self.ob_s[mi-1]))
+        self.cdotr_pr_ys[mi-1] = np.real(Cdot(self.pr_y[mi-1],
+            self.pr_s[mi-1]))
+        self.cn2_ob_y[mi-1] = Cnorm2(self.ob_y[mi-1])
+        self.cn2_pr_y[mi-1] = Cnorm2(self.pr_y[mi-1])
+
+    def _get_ob_pr_sh(self, k):
+        ob_sh = np.real(Cdot(self.ob_s[k], self.ob_h))
+        pr_sh = np.real(Cdot(self.pr_s[k], self.pr_h))
+        return ob_sh, pr_sh
+
+    def _get_ob_pr_yh(self, k):
+        ob_yh = np.real(Cdot(self.ob_y[k], self.ob_h))
+        pr_yh = np.real(Cdot(self.pr_y[k], self.pr_h))
+        return ob_yh, pr_yh
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations.
+        """
+        ########################
+        # Compute new gradient
+        ########################
+        tg = 0.
+        tc = 0.
+        ta = time.time()
+        for it in range(num):
+            t1 = time.time()
+            error_dct = self.ML_model.new_grad()
+            new_ob_grad, new_pr_grad = self.ob_grad_new, self.pr_grad_new
+            tg += time.time() - t1
+
+            if self.p.probe_update_start <= self.curiter:
+                # Apply probe support if needed
+                for name, s in new_pr_grad.storages.items():
+                    self.support_constraint(s)
+                    #support = self.probe_support.get(name)
+                    #if support is not None:
+                    #    s.data *= support
+            else:
+                new_pr_grad.fill(0.)
+
+            # probe/object rescaling (not used for now)
+            if self.p.scale_precond:
+                cn2_new_pr_grad = self._get_pr_norm()
+                cn2_new_ob_grad = self._get_ob_norm()
+
+                if cn2_new_pr_grad > 1e-5:
+                    scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad
+                                 / cn2_new_pr_grad)
+                else:
+                    scale_p_o = self.p.scale_probe_object
+                if self.scale_p_o is None:
+                    self.scale_p_o = scale_p_o
+                else:
+                    self.scale_p_o = self.scale_p_o ** self.scale_p_o_memory
+                    self.scale_p_o *= scale_p_o ** (1-self.scale_p_o_memory)
+                logger.debug('Scale P/O: %6.3g' % scale_p_o)
+            else:
+                self.scale_p_o = self.p.scale_probe_object
+
+            ############################
+            # LBFGS Two Loop Recursion
+            ############################
+            if self.curiter == 0: # Initial steepest-descent step
+
+                # Wavefield preconditioner (applied twice)
+                if self.p.wavefield_precond:
+                    for name, s in new_ob_grad.storages.items():
+                        new_ob_grad.storages[name].data /= self.ob_fln.storages[name].data + self.p.wavefield_delta_object
+                        new_pr_grad.storages[name].data /= self.pr_fln.storages[name].data + self.p.wavefield_delta_probe
+
+                # Object steepest-descent step
+                self.ob_h -= new_ob_grad
+
+                # Probe steepest-descent step
+                new_pr_grad *= self.scale_p_o # probe preconditioning (applied twice)
+                self.pr_h -= new_pr_grad
+
+            else: # Two-loop LBFGS recursion
+
+                # Wavefield preconditioner
+                if self.p.wavefield_precond:
+                    for name, s in self.ob_h.storages.items():
+                        self.ob_h.storages[name].data *= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                        self.pr_h.storages[name].data *= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+                    for name, s in new_ob_grad.storages.items():
+                        new_ob_grad.storages[name].data /= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                        new_pr_grad.storages[name].data /= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+
+                # Memory index
+                mi = min(self.curiter, self.p.bfgs_memory_size)
+
+                # Remember last object update and gradient difference
+                self.ob_s[mi-1] << self.ob_h
+                self.ob_y[mi-1] << new_ob_grad
+                self.ob_y[mi-1] -= self.ob_grad
+
+                # Remember last probe update and gradient difference
+                self.pr_h /= np.sqrt(self.scale_p_o) # probe preconditioning
+                self.pr_s[mi-1] << self.pr_h
+                new_pr_grad *= np.sqrt(self.scale_p_o) # probe preconditioning
+                self.pr_y[mi-1] << new_pr_grad
+                self.pr_y[mi-1] -= self.pr_grad
+
+                # Compute and store rho
+                self._replace_ob_pr_ys(mi)
+                self.rho[mi-1] = 1. / (self.cdotr_ob_ys[mi-1] +
+                        self.cdotr_pr_ys[mi-1])
+
+                # BFGS update
+                self.ob_h << new_ob_grad
+                self.pr_h << new_pr_grad
+                # Compute right-hand side of BGFS product
+                for i in reversed(range(mi)):
+                    ob_sh, pr_sh = self._get_ob_pr_sh(i)
+                    self.alpha[i] = self.rho[i]*(ob_sh + pr_sh)
+
+                    #TODO: support operand * for 'float' and 'Container'
+                    # (reusing self.ob_grad here is not efficient)
+                    # self.ob_h -= self.alpha[i]*self.ob_y[i]
+                    self.ob_grad << self.ob_y[i]
+                    self.ob_grad *= self.alpha[i]
+                    self.ob_h -= self.ob_grad
+                    #TODO: support operand * for 'float' and 'Container'
+                    # (reusing self.pr_grad here is not efficient)
+                    # self.pr_h -= self.alpha[i]*self.pr_y[i]
+                    self.pr_grad << self.pr_y[i]
+                    self.pr_grad *= self.alpha[i]
+                    self.pr_h -= self.pr_grad
+
+                # Compute centre of BFGS product (scaled identity)
+                c_num = self.cdotr_ob_ys[mi-1] + self.cdotr_pr_ys[mi-1]
+                c_denom = self.cn2_ob_y[mi-1] + self.cn2_pr_y[mi-1]
+                self.ob_h *= (c_num/c_denom)
+                self.pr_h *= (c_num/c_denom)
+
+
+                # Compute left-hand side of BFGS product
+                for i in range(mi):
+                    ob_yh, pr_yh = self._get_ob_pr_yh(i)
+                    beta = self.rho[i]*(ob_yh + pr_yh)
+
+                    #TODO: support operand * for 'float' and 'Container'
+                    # (reusing self.ob_grad here is not efficient)
+                    # self.ob_h += (self.alpha[i]-beta)*self.ob_s[i]
+                    self.ob_grad << self.ob_s[i]
+                    self.ob_grad *= (self.alpha[i]-beta)
+                    self.ob_h += self.ob_grad
+
+                    #TODO: support operand * for 'float' and 'Container'
+                    # (reusing self.pr_grad here is not efficient)
+                    # self.pr_h += (self.alpha[i]-beta)*self.pr_s[i]
+                    self.pr_grad << self.pr_s[i]
+                    self.pr_grad *= (self.alpha[i]-beta)
+                    self.pr_h += self.pr_grad
+
+                # Wavefield preconditioner
+                if self.p.wavefield_precond:
+                    for name, s in self.ob_h.storages.items():
+                        self.ob_h.storages[name].data /= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                        self.pr_h.storages[name].data /= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+
+                # Flip step direction for minimisation
+                self.ob_h *= -1
+                self.pr_h *= np.sqrt(self.scale_p_o) # probe preconditioning
+                self.pr_h *= -1
+
+            # update current gradients with new gradients
+            self.ob_grad << new_ob_grad
+            self.pr_grad << new_pr_grad
+
+            # linesearch (same as ML)
+            t2 = time.time()
+            B = self.ML_model.poly_line_coeffs(self.ob_h, self.pr_h)
+            tc += time.time() - t2
+
+
+            if np.isinf(B).any() or np.isnan(B).any():
+                logger.warning(
+                    'Warning! inf or nan found! Trying to continue...')
+                B[np.isinf(B)] = 0.
+                B[np.isnan(B)] = 0.
+
+            dt = self.ptycho.FType
+            self.tmin = dt(-.5 * B[1] / B[2])
+
+            # step update
+            self.ob_h *= self.tmin
+            self.pr_h *= self.tmin
+            self.ob += self.ob_h
+            self.pr += self.pr_h
+
+            # Roll memory for overwriting
+            if self.curiter >= self.p.bfgs_memory_size:
+                self.ob_s.append(self.ob_s.pop(0))
+                self.pr_s.append(self.pr_s.pop(0))
+                self.ob_y.append(self.ob_y.pop(0))
+                self.pr_y.append(self.pr_y.pop(0))
+                self.rho = np.roll(self.rho,-1)
+
+            # Position correction
+            self.position_update()
+
+            # Allow for customized modifications at the end of each iteration
+            self._post_iterate_update()
+
+            # increase iteration counter
+            self.curiter +=1
+
+        logger.info('Time spent in gradient calculation: %.2f' % tg)
+        logger.info('  ....  in coefficient calculation: %.2f' % tc)
+        return error_dct  # np.array([[self.ML_model.LL[0]] * 3])
+
+    def engine_finalize(self):
+        super(LBFGS_serial, self).engine_finalize()
diff --git a/templates/engines/cupy/moonflower_LBFGS_cupy.py b/templates/engines/cupy/moonflower_LBFGS_cupy.py
new file mode 100644
index 000000000..8cc235756
--- /dev/null
+++ b/templates/engines/cupy/moonflower_LBFGS_cupy.py
@@ -0,0 +1,62 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+from ptypy.custom import LBFGS_cupy
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'LBFGS_cupy'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+p.engines.engine00.floating_intensities = False
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_LBFGS_cupy_wavefield.py b/templates/engines/cupy/moonflower_LBFGS_cupy_wavefield.py
new file mode 100644
index 000000000..3273e715c
--- /dev/null
+++ b/templates/engines/cupy/moonflower_LBFGS_cupy_wavefield.py
@@ -0,0 +1,59 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+from ptypy.custom import LBFGS_cupy
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'LBFGS_cupy'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/numpy/moonflower_LBFGS_Gaussian.py b/templates/engines/numpy/moonflower_LBFGS_Gaussian.py
new file mode 100644
index 000000000..995bce464
--- /dev/null
+++ b/templates/engines/numpy/moonflower_LBFGS_Gaussian.py
@@ -0,0 +1,61 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+#import ptypy
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.custom import LBFGS
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+
+# saving intermediate results
+p.io.autosave = u.Param(active=False)
+
+# opens plotting GUI if interaction set to active)
+p.io.autoplot = u.Param(active=True)
+p.io.interaction = u.Param(active=True)
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'LBFGS'
+p.engines.engine00.ML_type = 'Gaussian'
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+#p.engines.engine00.scale_probe_object = 1.
+p.engines.engine00.floating_intensities = False
+p.engines.engine00.numiter = 300
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/numpy/moonflower_LBFGS_Gaussian_wavefield.py b/templates/engines/numpy/moonflower_LBFGS_Gaussian_wavefield.py
new file mode 100644
index 000000000..13c28a536
--- /dev/null
+++ b/templates/engines/numpy/moonflower_LBFGS_Gaussian_wavefield.py
@@ -0,0 +1,57 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+#import ptypy
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.custom import LBFGS
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+
+# saving intermediate results
+p.io.autosave = u.Param(active=False)
+
+# opens plotting GUI if interaction set to active)
+p.io.autoplot = u.Param(active=True)
+p.io.interaction = u.Param(active=True)
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'LBFGS'
+p.engines.engine00.ML_type = 'Gaussian'
+p.engines.engine00.numiter = 300
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/pycuda/moonflower_LBFGS_pycuda.py b/templates/engines/pycuda/moonflower_LBFGS_pycuda.py
new file mode 100644
index 000000000..2e4ea2828
--- /dev/null
+++ b/templates/engines/pycuda/moonflower_LBFGS_pycuda.py
@@ -0,0 +1,62 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cuda")
+from ptypy.custom import LBFGS_pycuda
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'LBFGS_pycuda'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+p.engines.engine00.floating_intensities = False
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/pycuda/moonflower_LBFGS_pycuda_wavefield.py b/templates/engines/pycuda/moonflower_LBFGS_pycuda_wavefield.py
new file mode 100644
index 000000000..eff11bf01
--- /dev/null
+++ b/templates/engines/pycuda/moonflower_LBFGS_pycuda_wavefield.py
@@ -0,0 +1,59 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cuda")
+from ptypy.custom import LBFGS_pycuda
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'LBFGS_pycuda'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/notebooks/moonflower_lbfgs.ipynb b/templates/notebooks/moonflower_lbfgs.ipynb
new file mode 100644
index 000000000..dffa0b537
--- /dev/null
+++ b/templates/notebooks/moonflower_lbfgs.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## PtyPy moonflower example\n",
+    "#### scan model: BlockGradFull\n",
+    "#### engine: Limited-memory BFGS (LBFGS)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from ptypy.core import Ptycho\n",
+    "from ptypy import utils as u\n",
+    "from ptypy.custom import LBFGS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# create parameter tree\n",
+    "p = u.Param()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# set verbose level to interactive\n",
+    "p.verbose_level = \"interactive\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# set home path and io settings (no files saved)\n",
+    "p.io = u.Param()\n",
+    "p.io.rfile = None\n",
+    "p.io.autosave = u.Param(active=False)\n",
+    "p.io.autoplot = u.Param(active=False)\n",
+    "p.io.interaction = u.Param(active=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# max 200 frames (128x128px) of diffraction data\n",
+    "p.scans = u.Param()\n",
+    "p.scans.MF = u.Param()\n",
+    "p.scans.MF.name = 'BlockGradFull'\n",
+    "p.scans.MF.data= u.Param()\n",
+    "p.scans.MF.data.name = 'MoonFlowerScan'\n",
+    "p.scans.MF.data.shape = 128\n",
+    "p.scans.MF.data.num_frames = 200\n",
+    "p.scans.MF.data.save = None\n",
+    "p.scans.MF.data.density = 0.2\n",
+    "p.scans.MF.data.photons = 1e8\n",
+    "p.scans.MF.data.psf = 0."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# maximum likelihood reconstrucion engine\n",
+    "p.engines = u.Param()\n",
+    "p.engines.engine00 = u.Param()\n",
+    "p.engines.engine00.name = 'LBFGS'\n",
+    "p.engines.engine00.ML_type = 'Gaussian'\n",
+    "p.engines.engine00.reg_del2 = True  \n",
+    "p.engines.engine00.reg_del2_amplitude = 1. \n",
+    "p.engines.engine00.scale_precond = True\n",
+    "p.engines.engine00.floating_intensities = False\n",
+    "p.engines.engine00.numiter = 300"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# prepare and run\n",
+    "P = Ptycho(p,level=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Plotting the results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import ptypy.utils.plot_client as pc\n",
+    "fig = pc.figure_from_ptycho(P)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/templates/notebooks/moonflower_lbfgs_pycuda.ipynb b/templates/notebooks/moonflower_lbfgs_pycuda.ipynb
new file mode 100644
index 000000000..a2a03f3d6
--- /dev/null
+++ b/templates/notebooks/moonflower_lbfgs_pycuda.ipynb
@@ -0,0 +1,194 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## PtyPy moonflower example\n",
+    "#### scan model: BlockGradFull\n",
+    "#### engine: Limited-memory BFGS (LBFGS) with GPU acceleration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import ptypy\n",
+    "from ptypy.core import Ptycho\n",
+    "from ptypy import utils as u"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Import pycuda engines (DM, RAAR)\n",
+    "ptypy.load_gpu_engines(\"cuda\")\n",
+    "from ptypy.custom import LBFGS_pycuda"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# create parameter tree\n",
+    "p = u.Param()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# set verbose level to interactive\n",
+    "p.verbose_level = \"interactive\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Nr. of frames in a block\n",
+    "p.frames_per_block = 20"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# set home path and io settings (no files saved)\n",
+    "p.io = u.Param()\n",
+    "p.io.rfile = None\n",
+    "p.io.autosave = u.Param(active=False)\n",
+    "p.io.autoplot = u.Param(active=False)\n",
+    "p.io.interaction = u.Param(active=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# max 200 frames (128x128px) of diffraction data\n",
+    "p.scans = u.Param()\n",
+    "p.scans.MF = u.Param()\n",
+    "p.scans.MF.name = 'BlockGradFull'\n",
+    "p.scans.MF.data= u.Param()\n",
+    "p.scans.MF.data.name = 'MoonFlowerScan'\n",
+    "p.scans.MF.data.shape = 128\n",
+    "p.scans.MF.data.num_frames = 200\n",
+    "p.scans.MF.data.save = None\n",
+    "p.scans.MF.data.density = 0.2\n",
+    "p.scans.MF.data.photons = 1e8\n",
+    "p.scans.MF.data.psf = 0."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# maximum likelihood reconstrucion engine\n",
+    "p.engines = u.Param()\n",
+    "p.engines.engine00 = u.Param()\n",
+    "p.engines.engine00.name = 'LBFGS_pycuda'\n",
+    "p.engines.engine00.ML_type = 'Gaussian'\n",
+    "p.engines.engine00.reg_del2 = True  \n",
+    "p.engines.engine00.reg_del2_amplitude = 1. \n",
+    "p.engines.engine00.scale_precond = True\n",
+    "p.engines.engine00.floating_intensities = False\n",
+    "p.engines.engine00.numiter = 300"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# prepare and run\n",
+    "P = Ptycho(p,level=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Plotting the results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import ptypy.utils.plot_client as pc\n",
+    "fig = pc.figure_from_ptycho(P)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/test/accelerate_tests/base_tests/LBFGS_test.py b/test/accelerate_tests/base_tests/LBFGS_test.py
new file mode 100644
index 000000000..8f679b1a6
--- /dev/null
+++ b/test/accelerate_tests/base_tests/LBFGS_test.py
@@ -0,0 +1,192 @@
+"""
+Test for the LBFGS engine.
+
+This file is part of the PTYPY package.
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: GPLv2, see LICENSE for details.
+"""
+import unittest
+
+from test import utils as tu
+from ptypy import utils as u
+import ptypy
+from ptypy.custom import LBFGS, LBFGS_serial
+import tempfile
+import shutil
+import numpy as np
+import pytest
+
+class LBFGSSerialTest(unittest.TestCase):
+
+    def setUp(self):
+        self.outpath = tempfile.mkdtemp(suffix="LBFGS_serial_test")
+
+    def tearDown(self):
+        shutil.rmtree(self.outpath)
+
+    def check_engine_output(self, output, plotting=False, debug=False, tol=0.1):
+        P_LBFGS, P_LBFGS_serial = output
+        numiter = len(P_LBFGS.runtime["iter_info"])
+        LL_LBFGS = np.array([P_LBFGS.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        LL_LBFGS_serial = np.array([P_LBFGS_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        crop = 42
+        OBJ_LBFGS_serial, OBJ_LBFGS = P_LBFGS_serial.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop], P_LBFGS.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop]
+        PRB_LBFGS_serial, PRB_LBFGS = P_LBFGS_serial.probe.S["SMFG00"].data[0], P_LBFGS.probe.S["SMFG00"].data[0]
+        eng_LBFGS = P_LBFGS.engines["engine00"]
+        eng_LBFGS_serial = P_LBFGS_serial.engines["engine00"]
+        if debug:
+            import matplotlib.pyplot as plt
+            plt.figure("LBFGS debug")
+            plt.imshow(np.abs(eng_LBFGS.debug))
+            plt.figure("LBFGS serial debug")
+            plt.imshow(np.abs(eng_LBFGS_serial.debug))
+            plt.show()
+
+        if plotting:
+            import matplotlib.pyplot as plt
+            plt.figure("Errors")
+            plt.plot(LL_LBFGS, label="LBFGS")
+            plt.plot(LL_LBFGS_serial, label="LBFGS_serial")
+            plt.legend()
+            plt.show()
+            plt.figure("Phase LBFGS")
+            plt.imshow(np.angle(OBJ_LBFGS))
+            plt.figure("Ampltitude LBFGS")
+            plt.imshow(np.abs(OBJ_LBFGS))
+            plt.figure("Phase LBFGS serial")
+            plt.imshow(np.angle(OBJ_LBFGS_serial))
+            plt.figure("Amplitude LBFGS serial")
+            plt.imshow(np.abs(OBJ_LBFGS_serial))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(OBJ_LBFGS_serial) - np.angle(OBJ_LBFGS), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(OBJ_LBFGS_serial) - np.abs(OBJ_LBFGS), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+        # np.testing.assert_allclose(eng_LBFGS.debug, eng_LBFGS_serial.debug, atol=1e-7, rtol=1e-7,
+        #                             err_msg="The debug arrays are not matching as expected")
+        RMSE_ob = (np.mean(np.abs(OBJ_LBFGS_serial - OBJ_LBFGS)**2))
+        RMSE_pr = (np.mean(np.abs(PRB_LBFGS_serial - PRB_LBFGS)**2))
+        # RMSE_LL = (np.mean(np.abs(LL_LBFGS_serial - LL_LBFGS)**2))
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol,
+                                    err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol,
+                                    err_msg="The probe arrays are not matching as expected")
+        # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
+                                    # err_msg="The log-likelihood errors are not matching as expected")
+
+
+    def test_LBFGS_serial_base(self):
+        out = []
+        for eng in ["LBFGS", "LBFGS_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_serial_regularizer(self):
+        out = []
+        for eng in ["LBFGS", "LBFGS_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 90
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.2)
+
+    def test_LBFGS_serial_preconditioner(self):
+        out = []
+        for eng in ["LBFGS", "LBFGS_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = True
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_serial_floating(self):
+        out = []
+        # fail at iter num 80
+        for eng in ["LBFGS", "LBFGS_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = True
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_serial_smoothing_regularizer(self):
+        out = []
+        for eng in ["LBFGS", "LBFGS_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    @pytest.mark.skip(reason="Funny behaviour with this test, most likely related to Gaussian filter, see issue #607")
+    def test_LBFGS_serial_wavefield_preconditioner(self):
+        out = []
+        for eng in ["LBFGS", "LBFGS_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 500
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0.
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
+
+    def test_LBFGS_serial_all(self):
+        out = []
+        for eng in ["LBFGS", "LBFGS_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = True
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_cupy_tests/LBFGS_test.py b/test/accelerate_tests/cuda_cupy_tests/LBFGS_test.py
new file mode 100644
index 000000000..902b62095
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/LBFGS_test.py
@@ -0,0 +1,190 @@
+"""
+Test for the LBFGS engine.
+
+This file is part of the PTYPY package.
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: GPLv2, see LICENSE for details.
+"""
+import unittest
+
+from test import utils as tu
+from ptypy import utils as u
+import ptypy
+from ptypy.custom import LBFGS_serial, LBFGS_cupy
+import tempfile
+import shutil
+import numpy as np
+
+class LBFGSCupyTest(unittest.TestCase):
+
+    def setUp(self):
+        self.outpath = tempfile.mkdtemp(suffix="LBFGS_cupy_test")
+
+    def tearDown(self):
+        shutil.rmtree(self.outpath)
+
+    def check_engine_output(self, output, plotting=False, debug=False, tol=0.1):
+        P_LBFGS_serial, P_LBFGS_cupy = output
+        numiter = len(P_LBFGS_serial.runtime["iter_info"])
+        LL_LBFGS_serial = np.array([P_LBFGS_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        LL_LBFGS_cupy = np.array([P_LBFGS_cupy.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        crop = 42
+        OBJ_LBFGS_cupy, OBJ_LBFGS_serial = P_LBFGS_cupy.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop], P_LBFGS_serial.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop]
+        PRB_LBFGS_cupy, PRB_LBFGS_serial = P_LBFGS_cupy.probe.S["SMFG00"].data[0], P_LBFGS_serial.probe.S["SMFG00"].data[0]
+        eng_LBFGS_serial = P_LBFGS_serial.engines["engine00"]
+        eng_LBFGS_cupy = P_LBFGS_cupy.engines["engine00"]
+        if debug:
+            import matplotlib.pyplot as plt
+            plt.figure("LBFGS serial debug")
+            plt.imshow(np.abs(eng_LBFGS_serial.debug))
+            plt.figure("LBFGS cupy debug")
+            plt.imshow(np.abs(eng_LBFGS_cupy.debug))
+            plt.show()
+
+        if plotting:
+            import matplotlib.pyplot as plt
+            plt.figure("Errors")
+            plt.plot(LL_LBFGS_serial, label="LBFGS_serial")
+            plt.plot(LL_LBFGS_cupy, label="LBFGS_cupy")
+            plt.legend()
+            plt.show()
+            plt.figure("Phase LBFGS serial")
+            plt.imshow(np.angle(OBJ_LBFGS_serial))
+            plt.figure("Ampltitude LBFGS serial")
+            plt.imshow(np.abs(OBJ_LBFGS_serial))
+            plt.figure("Phase LBFGS cupy")
+            plt.imshow(np.angle(OBJ_LBFGS_cupy))
+            plt.figure("Amplitude LBFGS cupy")
+            plt.imshow(np.abs(OBJ_LBFGS_cupy))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(OBJ_LBFGS_cupy) - np.angle(OBJ_LBFGS_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(OBJ_LBFGS_cupy) - np.abs(OBJ_LBFGS_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+        # np.testing.assert_allclose(eng_LBFGS.debug, eng_LBFGS_serial.debug, atol=1e-7, rtol=1e-7,
+        #                             err_msg="The debug arrays are not matching as expected")
+        RMSE_ob = (np.mean(np.abs(OBJ_LBFGS_cupy - OBJ_LBFGS_serial)**2))
+        RMSE_pr = (np.mean(np.abs(PRB_LBFGS_cupy - PRB_LBFGS_serial)**2))
+        # RMSE_LL = (np.mean(np.abs(LL_LBFGS_cupy - LL_LBFGS_serial)**2))
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol,
+                                    err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol,
+                                    err_msg="The probe arrays are not matching as expected")
+        # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
+                                    # err_msg="The log-likelihood errors are not matching as expected")
+
+
+    def test_LBFGS_cupy_base(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_cupy_regularizer(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 90
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_cupy_preconditioner(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = True
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_cupy_floating(self):
+        out = []
+        # fail at iter num 80
+        for eng in ["LBFGS_serial", "LBFGS_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = True
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_cupy_smoothing_regularizer(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_cupy_wavefield_preconditioner(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 500
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
+
+    def test_LBFGS_cupy_all(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = True
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_pycuda_tests/LBFGS_test.py b/test/accelerate_tests/cuda_pycuda_tests/LBFGS_test.py
new file mode 100644
index 000000000..f5a077d2a
--- /dev/null
+++ b/test/accelerate_tests/cuda_pycuda_tests/LBFGS_test.py
@@ -0,0 +1,190 @@
+"""
+Test for the LBFGS engine.
+
+This file is part of the PTYPY package.
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: GPLv2, see LICENSE for details.
+"""
+import unittest
+
+from test import utils as tu
+from ptypy import utils as u
+import ptypy
+from ptypy.custom import LBFGS_serial, LBFGS_pycuda
+import tempfile
+import shutil
+import numpy as np
+
+class LBFGSPycudaTest(unittest.TestCase):
+
+    def setUp(self):
+        self.outpath = tempfile.mkdtemp(suffix="LBFGS_pycuda_test")
+
+    def tearDown(self):
+        shutil.rmtree(self.outpath)
+
+    def check_engine_output(self, output, plotting=False, debug=False, tol=0.1):
+        P_LBFGS_serial, P_LBFGS_pycuda = output
+        numiter = len(P_LBFGS_serial.runtime["iter_info"])
+        LL_LBFGS_serial = np.array([P_LBFGS_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        LL_LBFGS_pycuda = np.array([P_LBFGS_pycuda.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        crop = 42
+        OBJ_LBFGS_pycuda, OBJ_LBFGS_serial = P_LBFGS_pycuda.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop], P_LBFGS_serial.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop]
+        PRB_LBFGS_pycuda, PRB_LBFGS_serial = P_LBFGS_pycuda.probe.S["SMFG00"].data[0], P_LBFGS_serial.probe.S["SMFG00"].data[0]
+        eng_LBFGS_serial = P_LBFGS_serial.engines["engine00"]
+        eng_LBFGS_pycuda = P_LBFGS_pycuda.engines["engine00"]
+        if debug:
+            import matplotlib.pyplot as plt
+            plt.figure("LBFGS serial debug")
+            plt.imshow(np.abs(eng_LBFGS_serial.debug))
+            plt.figure("LBFGS pycuda debug")
+            plt.imshow(np.abs(eng_LBFGS_pycuda.debug))
+            plt.show()
+
+        if plotting:
+            import matplotlib.pyplot as plt
+            plt.figure("Errors")
+            plt.plot(LL_LBFGS_serial, label="LBFGS_serial")
+            plt.plot(LL_LBFGS_pycuda, label="LBFGS_pycuda")
+            plt.legend()
+            plt.show()
+            plt.figure("Phase LBFGS serial")
+            plt.imshow(np.angle(OBJ_LBFGS_serial))
+            plt.figure("Ampltitude LBFGS serial")
+            plt.imshow(np.abs(OBJ_LBFGS_serial))
+            plt.figure("Phase LBFGS pycuda")
+            plt.imshow(np.angle(OBJ_LBFGS_pycuda))
+            plt.figure("Amplitude LBFGS pycuda")
+            plt.imshow(np.abs(OBJ_LBFGS_pycuda))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(OBJ_LBFGS_pycuda) - np.angle(OBJ_LBFGS_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(OBJ_LBFGS_pycuda) - np.abs(OBJ_LBFGS_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+        # np.testing.assert_allclose(eng_LBFGS.debug, eng_LBFGS_serial.debug, atol=1e-7, rtol=1e-7,
+        #                             err_msg="The debug arrays are not matching as expected")
+        RMSE_ob = (np.mean(np.abs(OBJ_LBFGS_pycuda - OBJ_LBFGS_serial)**2))
+        RMSE_pr = (np.mean(np.abs(PRB_LBFGS_pycuda - PRB_LBFGS_serial)**2))
+        # RMSE_LL = (np.mean(np.abs(LL_LBFGS_pycuda - LL_LBFGS_serial)**2))
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol,
+                                    err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol,
+                                    err_msg="The probe arrays are not matching as expected")
+        # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
+                                    # err_msg="The log-likelihood errors are not matching as expected")
+
+
+    def test_LBFGS_pycuda_base(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_pycuda_regularizer(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 90
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_pycuda_preconditioner(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = True
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_pycuda_floating(self):
+        out = []
+        # fail at iter num 80
+        for eng in ["LBFGS_serial", "LBFGS_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = True
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_pycuda_smoothing_regularizer(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_LBFGS_pycuda_wavefield_preconditioner(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 700
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
+
+    def test_LBFGS_pycuda_all(self):
+        out = []
+        for eng in ["LBFGS_serial", "LBFGS_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = True
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+if __name__ == "__main__":
+    unittest.main()