diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d53a9a632..1567443c1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,16 +20,19 @@ on:
 jobs:
   build-linux:
     runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -el {0}
     strategy:
       max-parallel: 10
       fail-fast: false
       matrix:
         python-version:
-          - "3.8"
           - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
+          - "3.13"
         conda-env:
           - "core"
           - "full"
@@ -37,72 +40,27 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
+    - uses: conda-incubator/setup-miniconda@v3
       with:
-        python-version:  ${{ matrix.python-version }}
-    - name: Add conda to system path
-      run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $CONDA/bin >> $GITHUB_PATH
-        conda --version
-        conda info
-    - name: Make sure conda is updated
-      run: |
-        conda update conda
-        conda --version
+        auto-update-conda: true
+        auto-activate-base: false
+        conda-remove-defaults: true
+        channels: conda-forge
+        activate-environment: ptypy_env
+        python-version: ${{ matrix.python-version }}
+        miniforge-version: latest
     - name: Install ${{ matrix.conda-env }} dependencies
       run: |
-        # replace python version in dependencies
         sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_${{ matrix.conda-env }}.yml
-        if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.12' ]; then
-          sed -i '/- pyfftw/d' dependencies_${{ matrix.conda-env }}.yml
-        fi
-        # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.8' ]; then
-        #   sed -i '/- mpi4py/d' dependencies_${{ matrix.conda-env }}.yml
-        # fi
-        # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.9' ]; then
-        #   sed -i '/- mpi4py/d' dependencies_${{ matrix.conda-env }}.yml
-        # fi
-        conda install --solver=classic mpich
-        conda env update --file dependencies_${{ matrix.conda-env }}.yml --name base
-        conda install --solver=classic flake8 pytest pytest-cov
-        conda list	
+        conda install -c conda-forge mpich
+        conda env update --file dependencies_${{ matrix.conda-env }}.yml --name ptypy_env
+        conda install -c conda-forge flake8 pytest pytest-cov
+        conda list
     - name: Prepare ptypy
-      run: |
-        # Install ptypy
-        pip install .
+      run: pip install .
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        flake8 . --count --select=E9,F63,F7,F82 --ignore=F824 --show-source --statistics
     - name: Test with pytest
-      run: |
-        # pytest ptypy/test -v --doctest-modules --junitxml=junit/test-results.xml --cov=ptypy --cov-report=xml --cov-report=html --cov-config=.coveragerc
-        pytest -v
-#    - name: cobertura-report
-#      if: github.event_name == 'pull_request' && (github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'synchronize')
-#      uses: 5monkeys/cobertura-action@v7
-#      with:
-#        # The GITHUB_TOKEN for this repo
-#        repo_token:  ${{ secrets.GITHUB_TOKEN }}
-#        # Path to the cobertura file.
-#        path: coverage.xml
-#        # If files with 100% should be skipped from report.
-#        skip_covered: true
-#        # Minimum allowed coverage percentage as an integer.
-#        minimum_coverage: 90
-#        only_changed_files: true
-#    - name: Junit Report to Annotations
-#      uses: ashley-taylor/junit-report-annotations-action@master
-#      with:
-#        # github token
-#        access-token: ${{ secrets.GITHUB_TOKEN }}
-#        # glob to junit xml files
-#        path: junit/test-results.xml
-#        # include summary annotation
-#        includeSummary: true
-#        # max number of failed tests to include
-#        numFailures: 10
+      run: pytest -v
diff --git a/cufft/extensions.py b/cufft/extensions.py
index 4d8b7f598..468d8d1cb 100644
--- a/cufft/extensions.py
+++ b/cufft/extensions.py
@@ -4,8 +4,8 @@
 import os, re
 import subprocess
 import sysconfig
-from distutils.unixccompiler import UnixCCompiler
-from distutils.command.build_ext import build_ext
+from setuptools._distutils.unixccompiler import UnixCCompiler
+from setuptools.command.build_ext import build_ext
 
 
 def find_in_path(name, path):
@@ -116,6 +116,7 @@ def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
         compiler_command = [self.CUDA["nvcc"]] + self.NVCC_FLAGS + self.OPTFLAGS + ["-Xcompiler"] + self.CXXFLAGS + CPPFLAGS
         compiler_exec = " ".join(compiler_command)
         self.set_executable('compiler_so', compiler_exec)
+        self.set_executable('compiler_so_cxx', compiler_exec)
         postargs = [] # we don't actually have any postargs
         super(NvccCompiler, self)._compile(obj, src, ext, cc_args, postargs, pp_opts) # the _compile method
         # reset the default compiler_so, which we might have changed for cuda
diff --git a/cufft/setup.py b/cufft/setup.py
index a8ef7c61e..b02262bdf 100644
--- a/cufft/setup.py
+++ b/cufft/setup.py
@@ -2,7 +2,7 @@
 
 # we should aim to remove the distutils dependency
 import setuptools
-from distutils.core import setup, Extension
+from setuptools import setup, Extension
 import os
 
 ext_modules = []
diff --git a/dependencies_dev.yml b/dependencies_dev.yml
index 5462c145a..6563f9dd8 100644
--- a/dependencies_dev.yml
+++ b/dependencies_dev.yml
@@ -10,6 +10,7 @@ dependencies:
   - pyzmq
   - pep8
   - mpi4py
+  - packaging
   - pillow
   - pyfftw
   - pip
diff --git a/dependencies_full.yml b/dependencies_full.yml
index e43241fce..205bdb19e 100644
--- a/dependencies_full.yml
+++ b/dependencies_full.yml
@@ -9,6 +9,7 @@ dependencies:
   - h5py
   - pyzmq
   - mpi4py
+  - packaging
   - pillow
   - pyfftw
   - pyyaml
diff --git a/doc/script2rst.py b/doc/script2rst.py
index b05148e46..b30d94667 100644
--- a/doc/script2rst.py
+++ b/doc/script2rst.py
@@ -1,5 +1,6 @@
 import sys
 import io
+from importlib.resources import files
 import contextlib
 import os
 
@@ -9,13 +10,12 @@
            'simupod.py',
            'ownengine.py',
            'subclassptyscan.py']
+_ptypy_dir = files('ptypy')
 
 if len(sys.argv) == 1:
-    import pkg_resources
-
     for script in scripts:
-        scr = pkg_resources.resource_filename('ptypy', tutorial_dir+script)
-        if not os.path.exists(scr):
+        scr = _ptypy_dir / (tutorial_dir + script)
+        if not scr.exists():
             print('Using backup tutorial for %s' % script)
             scr = '../tutorial/'+script
         #subprocess.call(['python',sys.argv[0]+' '+scr]) # doesn't work
@@ -50,13 +50,13 @@ def stdoutIO(stdout=None):
 frst.write("""
 .. note::
    This tutorial was generated from the python source
-   :file:`[ptypy_root]/tutorial/%(fname)s` using :file:`ptypy/doc/%(this)s`. 
+   :file:`[ptypy_root]/tutorial/%(fname)s` using :file:`ptypy/doc/%(this)s`.
    You are encouraged to modify the parameters and rerun the tutorial with::
-   
+
      $ python [ptypy_root]/tutorial/%(fname)s
 
 """ % {'fname': os.path.split(script_name)[-1], 'this': sys.argv[0]})
- 
+
 was_comment = True
 
 while True:
@@ -86,7 +86,7 @@ def stdoutIO(stdout=None):
             frst.write('   '+line2[1:].strip()+'\n')
         frst.write('\n')
         continue
-        
+
     if line.startswith('"""'):
         frst.write('.. parsed-literal::\n\n')
         while True:
@@ -95,11 +95,11 @@ def stdoutIO(stdout=None):
                 break
             frst.write('   ' + line2)
         continue
-    
+
     decorator = False
     indent = False
     for key in indent_keys:
-        if line.startswith(key): 
+        if line.startswith(key):
             indent = True
             break
 
@@ -125,12 +125,12 @@ def stdoutIO(stdout=None):
             pt = fpy.tell()
         exec(func+'\n')
         continue
-        
+
     wline = line.strip()
     if not wline:
         frst.write('\n')
         continue
-    
+
     with stdoutIO() as sout:
         exec(wline)
         out = sout.getvalue()
@@ -150,9 +150,9 @@ def stdoutIO(stdout=None):
             if was_comment:
                 wline = '\n::\n\n'+wline
                 was_comment = False
-                
+
             frst.write(wline+'\n')
-        
+
         #print out
         if out.strip():
             print(out)
@@ -160,5 +160,5 @@ def stdoutIO(stdout=None):
                 frst.write(' '*3+l+'\n')
             out = ''
 
-    
+
 
diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py
index bec4cc8e4..5af588413 100644
--- a/ptypy/accelerate/base/array_utils.py
+++ b/ptypy/accelerate/base/array_utils.py
@@ -70,14 +70,20 @@ def complex_gaussian_filter(input, mfs):
     '''
     takes 2D and 3D arrays. Complex input, complex output. mfs has len 0<x<=2
     '''
-    if len(mfs) > 2:
+    if isinstance(mfs, list) and len(mfs) > 2:
         raise NotImplementedError("Only batches of 2D arrays allowed!")
+    if isinstance(mfs, (int, float)):
+        mfs_list = [mfs, mfs]
+        if input.ndim == 3:
+            mfs_list = [0, mfs, mfs]
+    elif isinstance(mfs, (list,tuple)):
+        mfs_list = list(mfs)
+        if input.ndim == 3:
+            mfs_list.insert(0,0)
+    else:
+        raise NotImplementedError(f"Cannot interpret mfs of type {type(mfs)}")
 
-    if input.ndim == 3:
-        mfs = np.insert(mfs, 0, 0)
-
-    return (ndi.gaussian_filter(np.real(input), mfs) + 1j * ndi.gaussian_filter(np.imag(input), mfs)).astype(
-        input.dtype)
+    return (ndi.gaussian_filter(np.real(input), mfs_list).astype(input.dtype) + 1j * ndi.gaussian_filter(np.imag(input), mfs_list))
 
 
 def complex_gaussian_filter_fft(input, mfs):
diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py
index 1e953d26a..3618d0f9c 100644
--- a/ptypy/accelerate/base/engines/ML_serial.py
+++ b/ptypy/accelerate/base/engines/ML_serial.py
@@ -50,6 +50,7 @@ def __init__(self, ptycho_parent, pars=None):
         self.diff_info = {}
         self.cn2_ob_grad = 0.
         self.cn2_pr_grad = 0.
+        self.sqrt = np.sqrt
 
     def engine_initialize(self):
         """
@@ -162,6 +163,13 @@ def _get_smooth_gradient(self, data, sigma):
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
+
+        # Wavefield preconditioner for the object
+        if self.p.wavefield_precond:
+            for name, s in new_ob_grad.storages.items():
+                s.data /= np.sqrt(self.ob_fln.storages[name].data
+                                  + self.p.wavefield_delta_object)
+
         # Smoothing preconditioner
         if self.smooth_gradient:
             self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -175,7 +183,8 @@ def _replace_ob_grad(self):
 
     def _replace_pr_grad(self):
         new_pr_grad = self.pr_grad_new
-        # probe support
+
+        # Probe support
         if self.p.probe_update_start <= self.curiter:
             # Apply probe support if needed
             for name, s in new_pr_grad.storages.items():
@@ -183,6 +192,12 @@ def _replace_pr_grad(self):
         else:
             new_pr_grad.fill(0.)
 
+        # Wavefield preconditioner for the probe
+        if self.p.wavefield_precond:
+            for name, s in new_pr_grad.storages.items():
+                s.data /= np.sqrt(self.pr_fln.storages[name].data
+                                  + self.p.wavefield_delta_probe)
+
         norm = Cnorm2(new_pr_grad)
         dot = np.real(Cdot(new_pr_grad, self.pr_grad))
         self.pr_grad << new_pr_grad
@@ -240,19 +255,36 @@ def engine_iterate(self, num=1):
             self.cn2_pr_grad = cn2_new_pr_grad
 
             dt = self.ptycho.FType
+
             # 3. Next conjugate
             self.ob_h *= dt(bt / self.tmin)
-
-            # Smoothing preconditioner
-            if self.smooth_gradient:
+            # Wavefield preconditioner for the object (with and without smoothing preconditioner)
+            if self.p.wavefield_precond:
                 for name, s in self.ob_h.storages.items():
-                    s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data, self.smooth_gradient.sigma)
+                    if self.smooth_gradient:
+                        s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data
+                                      / self.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                                      , self.smooth_gradient.sigma)
+                    else:
+                        s.data[:] -= (self.ob_grad.storages[name].data
+                                      / self.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object))
             else:
-                self.ob_h -= self.ob_grad
+                # Smoothing preconditioner for the object
+                if self.smooth_gradient:
+                    for name, s in self.ob_h.storages.items():
+                        s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data, self.smooth_gradient.sigma)
+                else:
+                    self.ob_h -= self.ob_grad
 
             self.pr_h *= dt(bt / self.tmin)
             self.pr_grad *= dt(self.scale_p_o)
-            self.pr_h -= self.pr_grad
+            # Wavefield preconditioner for the probe
+            if self.p.wavefield_precond:
+                for name, s in self.pr_h.storages.items():
+                    s.data[:] -= (self.pr_grad.storages[name].data
+                                  / self.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe))
+            else:
+                self.pr_h -= self.pr_grad
 
             # In principle, the way things are now programmed this part
             # could be iterated over in a real Newton-Raphson style.
@@ -416,6 +448,11 @@ def new_grad(self):
         pr_grad = self.engine.pr_grad_new
         ob_grad << 0.
         pr_grad << 0.
+        if self.engine.p.wavefield_precond:
+            ob_fln = self.engine.ob_fln
+            pr_fln = self.engine.pr_fln
+            ob_fln << 0.
+            pr_fln << 0.
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -450,6 +487,11 @@ def new_grad(self):
             prg = pr_grad.S[pID].data
             I = prep.I
 
+            # local references for wavefield precond
+            if self.engine.p.wavefield_precond:
+                obf = ob_fln.S[oID].data
+                prf = pr_fln.S[pID].data
+
             # make propagated exit (to buffer)
             AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
 
@@ -465,8 +507,12 @@ def new_grad(self):
             GDK.error_reduce(addr, err_phot)
             aux[:] = BW(aux)
 
-            POK.ob_update_ML(addr, obg, pr, aux)
-            POK.pr_update_ML(addr, prg, ob, aux)
+            if self.engine.p.wavefield_precond:
+                POK.ob_update_ML_wavefield(addr, obg, obf, pr, aux)
+                POK.pr_update_ML_wavefield(addr, prg, prf, ob, aux)
+            else:
+                POK.ob_update_ML(addr, obg, pr, aux)
+                POK.pr_update_ML(addr, prg, ob, aux)
 
         for dID, prep in self.engine.diff_info.items():
             err_phot = prep.err_phot
@@ -480,6 +526,9 @@ def new_grad(self):
         # MPI reduction of gradients
         ob_grad.allreduce()
         pr_grad.allreduce()
+        if self.engine.p.wavefield_precond:
+            ob_fln.allreduce()
+            pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py
index c5343f940..22eb37747 100644
--- a/ptypy/accelerate/base/kernels.py
+++ b/ptypy/accelerate/base/kernels.py
@@ -608,6 +608,32 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0):
                 ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac
         return
 
+    def ob_update_ML_wavefield(self, addr, ob, obf, pr, ex, fac=2.0):
+
+        sh = addr.shape
+        flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
+        rows, cols = ex.shape[-2:]
+        for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr):
+            ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \
+                pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj() * \
+                ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac
+            obf[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \
+                abs2(pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols])
+        return
+
+    def pr_update_ML_wavefield(self, addr, pr, prf, ob, ex, fac=2.0):
+
+        sh = addr.shape
+        flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
+        rows, cols = ex.shape[-2:]
+        for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr):
+            pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \
+                ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj() * \
+                ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac
+            prf[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \
+                abs2(ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols])
+        return
+
     def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
         sh = addr.shape
         flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
diff --git a/ptypy/accelerate/cuda_common/ob_update2_ML_wavefield.cu b/ptypy/accelerate/cuda_common/ob_update2_ML_wavefield.cu
new file mode 100644
index 000000000..649ce4a89
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/ob_update2_ML_wavefield.cu
@@ -0,0 +1,123 @@
+/** ob_update2_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator for the ob field
+ *
+ * NOTE: This version of ob_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in ob_update_ML_wavefield.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */
+
+#include "common.cuh"
+
+#define pr_dlayer(k) addr[(k)]
+#define ex_dlayer(k) addr[6 * num_pods + (k)]
+#define obj_dlayer(k) addr[3 * num_pods + (k)]
+#define obj_roi_row(k) addr[4 * num_pods + (k)]
+#define obj_roi_column(k) addr[5 * num_pods + (k)]
+
+extern "C" __global__ void ob_update2_ML_wavefield(int pr_sh,
+                                                   int ob_modes,
+                                                   int num_pods,
+                                                   int ob_sh_rows,
+                                                   int ob_sh_cols,
+                                                   int pr_modes,
+                                                   complex<OUT_TYPE>* ob_g,
+                                                   const complex<IN_TYPE>* __restrict__ pr_g,
+                                                   const complex<IN_TYPE>* __restrict__ ex_g,
+                                                   OUT_TYPE* ob_f,
+                                                   const int* addr,
+                                                   IN_TYPE fac_)
+{
+  int y = blockIdx.y * BDIM_Y + threadIdx.y;
+  int dy = ob_sh_rows;
+  int z = blockIdx.x * BDIM_X + threadIdx.x;
+  int dz = ob_sh_cols;
+  MATH_TYPE fac = fac_;
+  complex<ACC_TYPE> ob[NUM_MODES];
+  ACC_TYPE of[NUM_MODES];
+
+  int txy = threadIdx.y * BDIM_X + threadIdx.x;
+  assert(ob_modes <= NUM_MODES);
+
+  if (y < dy && z < dz)
+  {
+#pragma unroll
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      auto idx = i * dy * dz + y * dz + z;
+      assert(idx < ob_modes * ob_sh_rows * ob_sh_cols);
+      ob[i] = ob_g[idx];
+      of[i] = ob_f[idx];
+    }
+  }
+
+  __shared__ int addresses[BDIM_X * BDIM_Y * 5];
+
+  for (int p = 0; p < num_pods; p += BDIM_X * BDIM_Y)
+  {
+    int mi = BDIM_X * BDIM_Y;
+    if (mi > num_pods - p)
+      mi = num_pods - p;
+
+    if (p > 0)
+      __syncthreads();
+
+    if (txy < mi)
+    {
+      assert(p + txy < num_pods);
+      assert(txy < BDIM_X * BDIM_Y);
+      addresses[txy * 5 + 0] = pr_dlayer(p + txy);
+      addresses[txy * 5 + 1] = ex_dlayer(p + txy);
+      addresses[txy * 5 + 2] = obj_dlayer(p + txy);
+      assert(obj_dlayer(p + txy) < NUM_MODES);
+      assert(addresses[txy * 5 + 2] < NUM_MODES);
+      addresses[txy * 5 + 3] = obj_roi_row(p + txy);
+      addresses[txy * 5 + 4] = obj_roi_column(p + txy);
+    }
+
+    __syncthreads();
+
+    if (y >= dy || z >= dz)
+      continue;
+
+#pragma unroll 4
+    for (int i = 0; i < mi; ++i)
+    {
+      int* ad = addresses + i * 5;
+      int v1 = y - ad[3];
+      int v2 = z - ad[4];
+      if (v1 >= 0 && v1 < pr_sh && v2 >= 0 && v2 < pr_sh)
+      {
+        auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2;
+        assert(pridx < pr_modes * pr_sh * pr_sh);
+        complex<MATH_TYPE> pr = pr_g[pridx];
+        int idx = ad[2];
+        assert(idx < NUM_MODES);
+        auto cpr = conj(pr);
+        auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2;
+        complex<MATH_TYPE> ex_val = ex_g[exidx];
+        complex<ACC_TYPE> add_val = cpr * ex_val * fac;
+        ob[idx] += add_val;
+        complex<MATH_TYPE> abs2_val = cpr * pr;
+        ACC_TYPE add_val2 = abs2_val.real();
+        of[idx] += add_val2;
+      }
+    }
+  }
+
+  if (y < dy && z < dz)
+  {
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      ob_g[i * dy * dz + y * dz + z] = ob[i];
+      ob_f[i * dy * dz + y * dz + z] = of[i];
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_common/ob_update_ML_wavefield.cu b/ptypy/accelerate/cuda_common/ob_update_ML_wavefield.cu
new file mode 100644
index 000000000..bab2fd620
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/ob_update_ML_wavefield.cu
@@ -0,0 +1,73 @@
+/** ob_update_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
+#include "common.cuh"
+
+template <class T>
+__device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
+{
+  auto xf = reinterpret_cast<T*>(x);
+  atomicAdd(xf, y.real());
+  atomicAdd(xf + 1, y.imag());
+}
+
+extern "C"
+{
+  __global__ void ob_update_ML_wavefield(const complex<IN_TYPE>* __restrict__ exit_wave,
+                                         int A,
+                                         int B,
+                                         int C,
+                                         const complex<IN_TYPE>* __restrict__ probe,
+                                         int D,
+                                         int E,
+                                         int F,
+                                         complex<OUT_TYPE>* obj,
+                                         int G,
+                                         int H,
+                                         int I,
+                                         OUT_TYPE* obj_fln,
+                                         const int* __restrict__ addr,
+                                         IN_TYPE fac_)
+  {
+    const int bid = blockIdx.x;
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
+    const int addr_stride = 15;
+    MATH_TYPE fac = fac_;
+
+    const int* oa = addr + 3 + bid * addr_stride;
+    const int* pa = addr + bid * addr_stride;
+    const int* ea = addr + 6 + bid * addr_stride;
+
+    probe += pa[0] * E * F + pa[1] * F + pa[2];
+    obj += oa[0] * H * I + oa[1] * I + oa[2];
+    obj_fln += oa[0] * H * I + oa[1] * I + oa[2];
+
+    assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I);
+
+    exit_wave += ea[0] * B * C;
+
+    for (int b = ty; b < B; b += blockDim.y)
+    {
+      for (int c = tx; c < C; c += blockDim.x)
+      {
+        complex<MATH_TYPE> probe_val = probe[b * F + c];
+        complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+        complex<MATH_TYPE> add_val_m = conj(probe_val) * exit_val * fac;
+        complex<OUT_TYPE> add_val = add_val_m;
+
+        atomicAdd(&obj[b * I + c], add_val);
+
+        complex<MATH_TYPE> abs2_val = conj(probe_val) * probe_val;
+        OUT_TYPE add_val2 = abs2_val.real();
+
+        atomicAdd(&obj_fln[b * I + c], add_val2);
+      }
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_common/pr_update2_ML_wavefield.cu b/ptypy/accelerate/cuda_common/pr_update2_ML_wavefield.cu
new file mode 100644
index 000000000..567d928be
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/pr_update2_ML_wavefield.cu
@@ -0,0 +1,123 @@
+/** pr_update2_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator type for local pr array
+ *
+ * NOTE: This version of pr_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in pr_update_ML_wavefield.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */
+
+#include "common.cuh"
+
+#define pr_dlayer(k) addr[(k)]
+#define ex_dlayer(k) addr[6 * num_pods + (k)]
+#define obj_dlayer(k) addr[3 * num_pods + (k)]
+#define obj_roi_row(k) addr[4 * num_pods + (k)]
+#define obj_roi_column(k) addr[5 * num_pods + (k)]
+
+extern "C" __global__ void pr_update2_ML_wavefield(int pr_sh,
+                                                   int ob_sh_row,
+                                                   int ob_sh_col,
+                                                   int pr_modes,
+                                                   int ob_modes,
+                                                   int num_pods,
+                                                   complex<OUT_TYPE>* pr_g,
+                                                   const complex<IN_TYPE>* __restrict__ ob_g,
+                                                   const complex<IN_TYPE>* __restrict__ ex_g,
+                                                   OUT_TYPE* pr_f,
+                                                   const int* addr,
+                                                   IN_TYPE fac_)
+{
+  int y = blockIdx.y * BDIM_Y + threadIdx.y;
+  int dy = pr_sh;
+  int z = blockIdx.x * BDIM_X + threadIdx.x;
+  int dz = pr_sh;
+  MATH_TYPE fac = fac_;
+  complex<ACC_TYPE> pr[NUM_MODES];
+  ACC_TYPE pf[NUM_MODES];
+
+  int txy = threadIdx.y * BDIM_X + threadIdx.x;
+  assert(pr_modes <= NUM_MODES);
+
+  if (y < pr_sh && z < pr_sh)
+  {
+#pragma unroll
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      auto idx = i * dy * dz + y * dz + z;
+      assert(idx < pr_modes * pr_sh * pr_sh);
+      pr[i] = pr_g[idx];
+      pf[i] = pr_f[idx];
+    }
+  }
+
+  __shared__ int addresses[BDIM_X * BDIM_Y * 5];
+
+  for (int p = 0; p < num_pods; p += BDIM_X * BDIM_Y)
+  {
+    int mi = BDIM_X * BDIM_Y;
+    if (mi > num_pods - p)
+      mi = num_pods - p;
+
+    if (p > 0)
+      __syncthreads();
+
+    if (txy < mi)
+    {
+      assert(p + txy < num_pods);
+      assert(txy < BDIM_X * BDIM_Y);
+      addresses[txy * 5 + 0] = pr_dlayer(p + txy);
+      addresses[txy * 5 + 1] = ex_dlayer(p + txy);
+      addresses[txy * 5 + 2] = obj_dlayer(p + txy);
+      assert(obj_dlayer(p + txy) < NUM_MODES);
+      assert(addresses[txy * 5 + 2] < NUM_MODES);
+      addresses[txy * 5 + 3] = obj_roi_row(p + txy);
+      addresses[txy * 5 + 4] = obj_roi_column(p + txy);
+    }
+
+    __syncthreads();
+
+    if (y >= pr_sh || z >= pr_sh)
+      continue;
+
+#pragma unroll 4
+    for (int i = 0; i < mi; ++i)
+    {
+      int* ad = addresses + i * 5;
+      int v1 = y + ad[3];
+      int v2 = z + ad[4];
+      if (v1 >= 0 && v1 < ob_sh_row && v2 >= 0 && v2 < ob_sh_col)
+      {
+        auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2;
+        assert(obidx < ob_modes * ob_sh_row * ob_sh_col);
+        complex<MATH_TYPE> ob = ob_g[obidx];
+        int idx = ad[0];
+        assert(idx < NUM_MODES);
+        auto cob = conj(ob);
+        auto exidx = ad[1] * pr_sh * pr_sh + y * pr_sh + z;
+        complex<MATH_TYPE> ex_val = ex_g[exidx];
+        complex<ACC_TYPE> add_val = cob * ex_val * fac;
+        pr[idx] += add_val;
+        complex<MATH_TYPE> abs2_val = cob * ob;
+        ACC_TYPE add_val2 = abs2_val.real();
+        pf[idx] += add_val2;
+      }
+    }
+  }
+
+  if (y < pr_sh && z < pr_sh)
+  {
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      pr_g[i * dy * dz + y * dz + z] = pr[i];
+      pr_f[i * dy * dz + y * dz + z] = pf[i];
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_common/pr_update_ML_wavefield.cu b/ptypy/accelerate/cuda_common/pr_update_ML_wavefield.cu
new file mode 100644
index 000000000..de236fc7c
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/pr_update_ML_wavefield.cu
@@ -0,0 +1,73 @@
+#/** pr_update_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
+#include "common.cuh"
+
+template <class T>
+__device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
+{
+  auto xf = reinterpret_cast<T*>(x);
+  atomicAdd(xf, y.real());
+  atomicAdd(xf + 1, y.imag());
+}
+
+extern "C"
+{
+  __global__ void pr_update_ML_wavefield(const complex<IN_TYPE>* __restrict__ exit_wave,
+                                                  int A,
+                                                  int B,
+                                                  int C,
+                                                  complex<OUT_TYPE>* probe,
+                                                  int D,
+                                                  int E,
+                                                  int F,
+                                                  const complex<IN_TYPE>* __restrict__ obj,
+                                                  int G,
+                                                  int H,
+                                                  int I,
+                                                  OUT_TYPE* probe_fln,
+                                                  const int* __restrict__ addr,
+                                                  IN_TYPE fac_)
+  {
+    const int bid = blockIdx.x;
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
+    const int addr_stride = 15;
+    MATH_TYPE fac = fac_;
+
+    const int* oa = addr + 3 + bid * addr_stride;
+    const int* pa = addr + bid * addr_stride;
+    const int* ea = addr + 6 + bid * addr_stride;
+
+    probe += pa[0] * E * F + pa[1] * F + pa[2];
+    probe_fln += pa[0] * E * F + pa[1] * F + pa[2];
+    obj += oa[0] * H * I + oa[1] * I + oa[2];
+
+    assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I);
+
+    exit_wave += ea[0] * B * C;
+
+    for (int b = ty; b < B; b += blockDim.y)
+    {
+      for (int c = tx; c < C; c += blockDim.x)
+      {
+        complex<MATH_TYPE> obj_val = obj[b * I + c];
+        complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+        complex<MATH_TYPE> add_val_m = conj(obj_val) * exit_val * fac;
+        complex<OUT_TYPE> add_val = add_val_m;
+
+        atomicAdd(&probe[b * F + c], add_val);
+
+        complex<MATH_TYPE> abs2_val = conj(obj_val) * obj_val;
+        OUT_TYPE add_val2 = abs2_val.real();
+
+        atomicAdd(&probe_fln[b * F + c], add_val2);
+      }
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
index caa0a192d..5f35fbd67 100644
--- a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
+++ b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
@@ -69,6 +69,7 @@ def __init__(self, ptycho_parent, pars=None):
         Maximum likelihood reconstruction engine.
         """
         super().__init__(ptycho_parent, pars)
+        self.sqrt = cp.sqrt
 
     def engine_initialize(self):
         """
@@ -280,6 +281,13 @@ def _get_smooth_gradient(self, data, sigma):
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
+
+        # Wavefield preconditioner for the object
+        if self.p.wavefield_precond:
+            for name, s in new_ob_grad.storages.items():
+                s.gpu /= cp.sqrt(self.ob_fln.storages[name].gpu
+                                 + self.p.wavefield_delta_object)
+
         # Smoothing preconditioner
         if self.smooth_gradient:
             self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -291,7 +299,8 @@ def _replace_ob_grad(self):
 
     def _replace_pr_grad(self):
         new_pr_grad = self.pr_grad_new
-        # probe support
+
+        # Probe support
         if self.p.probe_update_start <= self.curiter:
             # Apply probe support if needed
             for name, s in new_pr_grad.storages.items():
@@ -299,6 +308,12 @@ def _replace_pr_grad(self):
         else:
             new_pr_grad.fill(0.)
 
+        # Wavefield preconditioner for the probe
+        if self.p.wavefield_precond:
+            for name, s in new_pr_grad.storages.items():
+                s.gpu /= cp.sqrt(self.pr_fln.storages[name].gpu
+                                 + self.p.wavefield_delta_probe)
+
         return self._replace_grad(self.pr_grad, new_pr_grad)
 
     def _replace_grad(self, grad, new_grad):
@@ -318,7 +333,7 @@ def engine_iterate(self, num=1):
         return err
 
     def position_update(self):
-        """ 
+        """
         Position refinement
         """
         if not self.do_position_refinement or (not self.curiter):
@@ -503,6 +518,12 @@ def new_grad(self):
         qu_htod = self.engine.qu_htod
         queue = self.engine.queue
 
+        if self.engine.p.wavefield_precond:
+            ob_fln = self.engine.ob_fln
+            pr_fln = self.engine.pr_fln
+            ob_fln << 0.
+            pr_fln << 0.
+
         self.engine._set_pr_ob_ref_for_data('gpu')
         ob_grad << 0.
         pr_grad << 0.
@@ -536,6 +557,9 @@ def new_grad(self):
             obg = ob_grad.S[oID].data
             pr = self.engine.pr.S[pID].data
             prg = pr_grad.S[pID].data
+            if self.engine.p.wavefield_precond:
+                obf = ob_fln.S[oID].data
+                prf = pr_fln.S[pID].data
 
             # Schedule w & I to device
             ev_w, w, data_w = self.engine.w_data.to_gpu(
@@ -564,11 +588,17 @@ def new_grad(self):
 
             use_atomics = self.p.object_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.ob_update_ML_wavefield(addr, obg, obf, pr, aux, atomics=use_atomics)
+            else:
+                POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
 
             use_atomics = self.p.probe_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.pr_update_ML_wavefield(addr, prg, prf, ob, aux, atomics=use_atomics)
+            else:
+                POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
 
         queue.synchronize()
         self.engine.dID_list.reverse()
@@ -591,10 +621,18 @@ def new_grad(self):
             s.gpu.get(out=s.cpu)
         for s in pr_grad.S.values():
             s.gpu.get(out=s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.get(out=s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.get(out=s.cpu)
         self.engine._set_pr_ob_ref_for_data('cpu')
 
         ob_grad.allreduce()
         pr_grad.allreduce()
+        if self.engine.p.wavefield_precond:
+            ob_fln.allreduce()
+            pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # HtoD cause we continue on gpu
@@ -602,6 +640,11 @@ def new_grad(self):
             s.gpu.set(s.cpu)
         for s in pr_grad.S.values():
             s.gpu.set(s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.set(s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.set(s.cpu)
         self.engine._set_pr_ob_ref_for_data('gpu')
 
         # Object regularizer
@@ -746,7 +789,7 @@ def delxf(x, axis=-1):
         self.dot = lambda x, y: self.AUK.dot(x, y).get().item()
 
         self._grad_reg_kernel = cp.ElementwiseKernel(
-            "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx", 
+            "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx",
             "complex64 out",
             "out = (px+py-my-mx) * fac",
             "grad_reg",
diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py
index b743e9b08..07cfe8a42 100644
--- a/ptypy/accelerate/cuda_cupy/kernels.py
+++ b/ptypy/accelerate/cuda_cupy/kernels.py
@@ -909,6 +909,18 @@ def __init__(self, queue_thread=None,
             'MATH_TYPE': self.math_type
         })
         self.pr_update2_ML_cuda = None
+        self.ob_update_ML_wavefield_cuda = load_kernel("ob_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.ob_update2_ML_wavefield_cuda = None
+        self.pr_update_ML_wavefield_cuda = load_kernel("pr_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.pr_update2_ML_wavefield_cuda = None
         self.ob_update_local_cuda = load_kernel("ob_update_local", {
             'IN_TYPE': 'float',
             'OUT_TYPE': 'float',
@@ -1096,6 +1108,54 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
                                           np.float32(
                                         fac) if ex.dtype == np.complex64 else np.float64(fac)))
 
+    def ob_update_ML_wavefield(self, addr, ob, of, pr, ex, fac=2.0, atomics=True):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+
+        if self.queue is not None:
+            self.queue.use()
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError(
+                    'Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.ob_update_ML_wavefield_cuda(grid=(int(num_pods), 1, 1),
+                                             block=(32, 32, 1),
+                                             args=(ex, num_pods, exsh[1], exsh[2],
+                                                   pr, prsh[0], prsh[1], prsh[2],
+                                                   ob, obsh[0], obsh[1], obsh[2],
+                                                   of,
+                                                   addr,
+                                                   np.float32(
+                                                 fac) if ex.dtype == np.complex64 else np.float64(fac)))
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError(
+                    'Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.ob_update2_ML_wavefield_cuda:
+                self.ob_update2_ML_wavefield_cuda = load_kernel("ob_update2_ML_wavefield", {
+                    "NUM_MODES": obsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+            grid = [int((x+15)//16) for x in ob.shape[-2:]]
+            grid = (grid[1], grid[0], int(1))
+            self.ob_update2_ML_wavefield_cuda(grid=grid,
+                                              block=(16, 16, 1),
+                                              args=(prsh[-1], obsh[0], num_pods,
+                                                    obsh[-2], obsh[-1], prsh[0],
+                                                    ob, pr, ex, of, addr,
+                                                    np.float32(
+                                                  fac) if ex.dtype == np.complex64 else np.float64(fac)))
+
     def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
         obsh = [np.int32(ax) for ax in ob.shape]
         prsh = [np.int32(ax) for ax in pr.shape]
@@ -1140,6 +1200,51 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
                                           np.float32(
                                         fac) if ex.dtype == np.complex64 else np.float64(fac)))
 
+    def pr_update_ML_wavefield(self, addr, pr, pf, ob, ex, fac=2.0, atomics=False):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        if self.queue is not None:
+            self.queue.use()
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError(
+                    'Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.pr_update_ML_wavefield_cuda(grid=(int(num_pods), 1, 1),
+                                             block=(32, 32, 1),
+                                             args=(ex, num_pods, prsh[1], prsh[2],
+                                                   pr, prsh[0], prsh[1], prsh[2],
+                                                   ob, obsh[0], obsh[1], obsh[2],
+                                                   pf,
+                                                   addr,
+                                                   np.float32(
+                                                 fac) if ex.dtype == np.complex64 else np.float64(fac)))
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError(
+                    'Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.pr_update2_ML_wavefield_cuda:
+                self.pr_update2_ML_wavefield_cuda = load_kernel("pr_update2_ML_wavefield", {
+                    "NUM_MODES": prsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+
+            grid = [int((x+15)//16) for x in pr.shape[-2:]]
+            grid = (grid[0], grid[1], int(1))
+            self.pr_update2_ML_wavefield_cuda(grid=grid,
+                                              block=(16, 16, 1),
+                                              args=(prsh[-1], obsh[-2], obsh[-1],
+                                                    prsh[0], obsh[0], num_pods,
+                                                    pr, ob, ex, pf, addr,
+                                                    np.float32(
+                                                  fac) if ex.dtype == np.complex64 else np.float64(fac)))
+
     def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
         if self.queue is not None:
             self.queue.use()
diff --git a/ptypy/accelerate/cuda_cupy/multi_gpu.py b/ptypy/accelerate/cuda_cupy/multi_gpu.py
index 6ccbce9a7..e631c3405 100644
--- a/ptypy/accelerate/cuda_cupy/multi_gpu.py
+++ b/ptypy/accelerate/cuda_cupy/multi_gpu.py
@@ -10,12 +10,12 @@
   - OpenMPI in a conda install needs to have the environment variable
   --> if cuda support isn't enabled, the application simply crashes with a seg fault
 
-2) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. 
+2) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
    It should be in DEFAULT mode.
 
 """
 
-from pkg_resources import parse_version
+from packaging.version import parse
 import numpy as np
 import cupy as cp
 from ptypy.utils import parallel
@@ -44,7 +44,7 @@
 have_cuda_mpi = (mpi4py is not None) and \
     "OMPI_MCA_opal_cuda_support" in os.environ and \
     os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \
-    parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \
+    parse(parse(mpi4py.__version__).base_version) >= parse("3.1.0") and \
     not ('PTYPY_USE_MPI' in os.environ)
 
 
@@ -114,7 +114,7 @@ def allReduceSum(self, arr):
 
         count, datatype = self.__get_NCCL_count_dtype(arr)
 
-        self.com.allReduce(arr.data.ptr, arr.data.ptr, count, datatype, nccl.NCCL_SUM, 
+        self.com.allReduce(arr.data.ptr, arr.data.ptr, count, datatype, nccl.NCCL_SUM,
             cp.cuda.get_current_stream().ptr)
 
     def __get_NCCL_count_dtype(self, arr):
diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
index d527d9f15..a05839b47 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
@@ -14,7 +14,7 @@
 import numpy as np
 from pycuda import gpuarray
 import pycuda.driver as cuda
-import pycuda.cumath
+import pycuda.cumath as cm
 
 from ptypy.engines import register
 from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial
@@ -73,6 +73,7 @@ def __init__(self, ptycho_parent, pars=None):
         Maximum likelihood reconstruction engine.
         """
         super().__init__(ptycho_parent, pars)
+        self.sqrt = cm.sqrt
 
     def engine_initialize(self):
         """
@@ -275,6 +276,13 @@ def _get_smooth_gradient(self, data, sigma):
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
+
+        # Wavefield preconditioner for the object
+        if self.p.wavefield_precond:
+            for name, s in new_ob_grad.storages.items():
+                s.gpu /= cm.sqrt(self.ob_fln.storages[name].gpu
+                              + self.p.wavefield_delta_object)
+
         # Smoothing preconditioner
         if self.smooth_gradient:
             self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -285,7 +293,8 @@ def _replace_ob_grad(self):
 
     def _replace_pr_grad(self):
         new_pr_grad = self.pr_grad_new
-        # probe support
+
+        # Probe support
         if self.p.probe_update_start <= self.curiter:
             # Apply probe support if needed
             for name, s in new_pr_grad.storages.items():
@@ -293,6 +302,12 @@ def _replace_pr_grad(self):
         else:
             new_pr_grad.fill(0.)
 
+        # Wavefield preconditioner for the probe
+        if self.p.wavefield_precond:
+            for name, s in new_pr_grad.storages.items():
+                s.gpu /= cm.sqrt(self.pr_fln.storages[name].gpu
+                              + self.p.wavefield_delta_probe)
+
         return self._replace_grad(self.pr_grad , new_pr_grad)
 
     def _replace_grad(self, grad, new_grad):
@@ -485,6 +500,12 @@ def new_grad(self):
         qu_htod = self.engine.qu_htod
         queue = self.engine.queue
 
+        if self.engine.p.wavefield_precond:
+            ob_fln = self.engine.ob_fln
+            pr_fln = self.engine.pr_fln
+            ob_fln << 0.
+            pr_fln << 0.
+
         self.engine._set_pr_ob_ref_for_data('gpu')
         ob_grad << 0.
         pr_grad << 0.
@@ -518,6 +539,9 @@ def new_grad(self):
             obg = ob_grad.S[oID].data
             pr = self.engine.pr.S[pID].data
             prg = pr_grad.S[pID].data
+            if self.engine.p.wavefield_precond:
+                obf = ob_fln.S[oID].data
+                prf = pr_fln.S[pID].data
 
             # Schedule w & I to device
             ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod)
@@ -545,11 +569,17 @@ def new_grad(self):
 
             use_atomics = self.p.object_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.ob_update_ML_wavefield(addr, obg, obf, pr, aux, atomics=use_atomics)
+            else:
+                POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
 
             use_atomics = self.p.probe_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.pr_update_ML_wavefield(addr, prg, prf, ob, aux, atomics=use_atomics)
+            else:
+                POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
 
         queue.synchronize()
         self.engine.dID_list.reverse()
@@ -571,10 +601,18 @@ def new_grad(self):
             s.gpu.get(s.cpu)
         for s in pr_grad.S.values():
             s.gpu.get(s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.get(s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.get(s.cpu)
         self.engine._set_pr_ob_ref_for_data('cpu')
 
         ob_grad.allreduce()
         pr_grad.allreduce()
+        if self.engine.p.wavefield_precond:
+            ob_fln.allreduce()
+            pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # HtoD cause we continue on gpu
@@ -582,6 +620,11 @@ def new_grad(self):
             s.gpu.set(s.cpu)
         for s in pr_grad.S.values():
             s.gpu.set(s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.set(s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.set(s.cpu)        
         self.engine._set_pr_ob_ref_for_data('gpu')
 
         # Object regularizer
diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py
index 8fcebb6ef..bab47a8a3 100644
--- a/ptypy/accelerate/cuda_pycuda/kernels.py
+++ b/ptypy/accelerate/cuda_pycuda/kernels.py
@@ -880,6 +880,18 @@ def __init__(self, queue_thread=None,
             'MATH_TYPE': self.math_type
         })
         self.pr_update2_ML_cuda = None
+        self.ob_update_ML_wavefield_cuda = load_kernel("ob_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.ob_update2_ML_wavefield_cuda = None
+        self.pr_update_ML_wavefield_cuda = load_kernel("pr_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.pr_update2_ML_wavefield_cuda = None
         self.ob_update_local_cuda = load_kernel("ob_update_local", {
             'IN_TYPE': 'float',
             'OUT_TYPE': 'float',
@@ -1045,6 +1057,46 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
                                     np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
                                     block=(16, 16, 1), grid=grid, stream=self.queue)
 
+    def ob_update_ML_wavefield(self, addr, ob, of, pr, ex, fac=2.0, atomics=True):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError('Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.ob_update_ML_wavefield_cuda(ex, num_pods, exsh[1], exsh[2],
+                                             pr, prsh[0], prsh[1], prsh[2],
+                                             ob, obsh[0], obsh[1], obsh[2],
+                                             of,
+                                             addr,
+                                             np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                             block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue)
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError('Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.ob_update2_ML_wavefield_cuda:
+                self.ob_update2_ML_wavefield_cuda = load_kernel("ob_update2_ML_wavefield", {
+                    "NUM_MODES": obsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+            grid = [int((x+15)//16) for x in ob.shape[-2:]]
+            grid = (grid[1], grid[0], int(1))
+            self.ob_update2_ML_wavefield_cuda(prsh[-1], obsh[0], num_pods,
+                                              obsh[-2], obsh[-1], prsh[0],
+                                              ob, pr, ex, of, addr,
+                                              np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                              block=(16, 16, 1), grid=grid, stream=self.queue)
+
     def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
         obsh = [np.int32(ax) for ax in ob.shape]
         prsh = [np.int32(ax) for ax in pr.shape]
@@ -1081,6 +1133,42 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
                                  np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
                                  block=(16, 16, 1), grid=grid, stream=self.queue)
 
+    def pr_update_ML_wavefield(self, addr, pr, pf, ob, ex, fac=2.0, atomics=False):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError('Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.pr_update_ML_wavefield_cuda(ex, num_pods, prsh[1], prsh[2],
+                                          pr, prsh[0], prsh[1], prsh[2],
+                                          ob, obsh[0], obsh[1], obsh[2],
+                                          pf,
+                                          addr,
+                                          np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                          block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue)
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError('Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.pr_update2_ML_wavefield_cuda:
+                self.pr_update2_ML_wavefield_cuda = load_kernel("pr_update2_ML_wavefield", {
+                    "NUM_MODES": prsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+
+            grid = [int((x+15)//16) for x in pr.shape[-2:]]
+            grid = (grid[0], grid[1], int(1))
+            self.pr_update2_ML_wavefield_cuda(prsh[-1], obsh[-2], obsh[-1],
+                                           prsh[0], obsh[0], num_pods,
+                                           pr, ob, ex, pf, addr,
+                                           np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                           block=(16, 16, 1), grid=grid, stream=self.queue)
 
     def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
         prn_max = gpuarray.max(prn, stream=self.queue)
diff --git a/ptypy/accelerate/cuda_pycuda/multi_gpu.py b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
index 73138c3ee..09ec60552 100644
--- a/ptypy/accelerate/cuda_pycuda/multi_gpu.py
+++ b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
@@ -5,7 +5,7 @@
 
 Findings:
 
-1) NCCL works with unit tests, but not in the engines. It seems to 
+1) NCCL works with unit tests, but not in the engines. It seems to
 add something to the existing pycuda Context or create a new one,
 as a later event recording on an exit wave transfer fails with
 'ivalid resource handle' Cuda Error. This error typically happens if for example
@@ -22,14 +22,14 @@
   - OpenMPI in a conda install needs to have the environment variable
   --> if cuda support isn't enabled, the application simply crashes with a seg fault
 
-4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. 
+4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
    It should be in DEFAULT mode.
 
 5) NCCL support has been dropped from PyCUDA module, but can be used with CuPy module instead
 
 """
 
-from pkg_resources import parse_version
+from packaging.version import parse
 import numpy as np
 from pycuda import gpuarray
 import pycuda.driver as cuda
@@ -54,7 +54,7 @@
 have_cuda_mpi = (mpi4py is not None) and \
     "OMPI_MCA_opal_cuda_support" in os.environ and \
     os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \
-    parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \
+    parse(parse(mpi4py.__version__).base_version) >= parse("3.1.0") and \
     hasattr(gpuarray.GPUArray, '__cuda_array_interface__') and \
     not ('PTYPY_USE_MPI' in os.environ)
 
@@ -97,7 +97,7 @@ def allReduceSum(self, arr):
         if parallel.MPIenabled:
             comm = parallel.comm
             comm.Allreduce(parallel.MPI.IN_PLACE, arr)
-            
+
 
 # pick the appropriate communicator depending on installed packages
 def get_multi_gpu_communicator(use_cuda_mpi=True):
diff --git a/ptypy/custom/WASP.py b/ptypy/custom/WASP.py
index 275224b9a..3ef84bb5c 100644
--- a/ptypy/custom/WASP.py
+++ b/ptypy/custom/WASP.py
@@ -124,11 +124,11 @@ def __init__(self, ptycho_parent, pars=None):
         self.article = dict(
                 title='WASP: Weighted Average of Sequential Projections for ptychographic phase retrieval',
                 author='A. M. Maiden, W. Mei and P. Li',
-                journal='Optica',
-                volume=42,
+                journal='Optics Express',
+                volume=32,
                 year=2024,
-                page=42,
-                doi='doi',
+                page=21327,
+                doi='10.1364/OE.516946',
                 comment='Weighted Average of Sequential Projections',
                 )
         self.ptycho.citations.add_article(**self.article)
@@ -237,7 +237,8 @@ def overlap_update(self):
             # update object first, then probe, and accumulate their sum for
             # averaging after going through all the views
             self.object_update(view, ex_old)
-            self.probe_update(view, ex_old, ob_old)
+            if self.p.probe_update_start <= self.curiter:
+                self.probe_update(view, ex_old, ob_old)
 
         # WASP
         self.wasp_averaging()
@@ -323,16 +324,17 @@ def wasp_averaging(self):
 
             self.clip_object(s)
 
-        for name, p in self.pr.storages.items():
-            pr_sum_nmr = self.pr_sum_nmr.storages[name].data
-            pr_sum_dnm = self.pr_sum_dnm.storages[name].data
+        if self.p.probe_update_start <= self.curiter:
+            for name, p in self.pr.storages.items():
+                pr_sum_nmr = self.pr_sum_nmr.storages[name].data
+                pr_sum_dnm = self.pr_sum_dnm.storages[name].data
 
-            parallel.allreduce(pr_sum_nmr)
-            parallel.allreduce(pr_sum_dnm)
+                parallel.allreduce(pr_sum_nmr)
+                parallel.allreduce(pr_sum_dnm)
 
-            # avoid division by zero
-            is_zero = np.isclose(pr_sum_dnm, 0)
-            p.data = np.where(is_zero, pr_sum_nmr, pr_sum_nmr / pr_sum_dnm)
+                # avoid division by zero
+                is_zero = np.isclose(pr_sum_dnm, 0)
+                p.data = np.where(is_zero, pr_sum_nmr, pr_sum_nmr / pr_sum_dnm)
 
     def clip_object(self, ob):
         """Copied from _ProjectionEngine
diff --git a/ptypy/custom/WASP_cupy.py b/ptypy/custom/WASP_cupy.py
index 06d1507fb..c23448c49 100644
--- a/ptypy/custom/WASP_cupy.py
+++ b/ptypy/custom/WASP_cupy.py
@@ -338,8 +338,9 @@ def engine_iterate(self, num=1):
                     ob_old = ob.copy()
                     POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
                                        ob_sum_dnm, alpha=self.p.alpha)
-                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
-                                       pr_sum_dnm, beta=self.p.beta)
+                    if self.p.probe_update_start <= self.curiter:
+                        POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                           pr_sum_dnm, beta=self.p.beta)
 
                     ## compute log-likelihood
                     if self.p.compute_log_likelihood:
@@ -355,7 +356,8 @@ def engine_iterate(self, num=1):
                 self.multigpu.allReduceSum(pr_sum_dnm)
 
                 POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
-                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+                if self.p.probe_update_start <= self.curiter:
+                    POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
 
                 # Clip object
                 if self.p.clip_object is not None:
diff --git a/ptypy/custom/WASP_pycuda.py b/ptypy/custom/WASP_pycuda.py
index e96bce1b5..e631e855a 100644
--- a/ptypy/custom/WASP_pycuda.py
+++ b/ptypy/custom/WASP_pycuda.py
@@ -338,9 +338,9 @@ def engine_iterate(self, num=1):
                     ob_old = ob.copy()
                     POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
                                        ob_sum_dnm, alpha=self.p.alpha)
-                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
-                                       pr_sum_dnm, beta=self.p.beta)
-
+                    if self.p.probe_update_start <= self.curiter:
+                        POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                           pr_sum_dnm, beta=self.p.beta)
 
                     ## compute log-likelihood
                     if self.p.compute_log_likelihood:
@@ -356,7 +356,8 @@ def engine_iterate(self, num=1):
                 self.multigpu.allReduceSum(pr_sum_dnm)
 
                 POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
-                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+                if self.p.probe_update_start <= self.curiter:
+                    POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
 
                 # Clip object
                 if self.p.clip_object is not None:
diff --git a/ptypy/custom/WASP_serial.py b/ptypy/custom/WASP_serial.py
index 96391ea27..ccd961b2a 100644
--- a/ptypy/custom/WASP_serial.py
+++ b/ptypy/custom/WASP_serial.py
@@ -292,8 +292,9 @@ def engine_iterate(self, num=1):
                     ob_old = ob.copy()
                     POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
                                        ob_sum_dnm, alpha=self.p.alpha)
-                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
-                                       pr_sum_dnm, beta=self.p.beta)
+                    if self.p.probe_update_start <= self.curiter:
+                        POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                           pr_sum_dnm, beta=self.p.beta)
 
                     self.benchmark.wasp_ob_pr_update += time.time() - t1
                     self.benchmark.calls_wasp_ob_pr_update += 1
@@ -321,7 +322,8 @@ def engine_iterate(self, num=1):
                 parallel.allreduce(pr_sum_dnm)
 
                 POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
-                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+                if self.p.probe_update_start <= self.curiter:
+                    POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
 
                 self.benchmark.wasp_averaging += time.time() - t1
                 self.benchmark.calls_wasp_averaging += 1
diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py
index 9d5bb7d1a..8cafd78c8 100644
--- a/ptypy/engines/ML.py
+++ b/ptypy/engines/ML.py
@@ -106,6 +106,22 @@ class ML(PositionCorrectionEngine):
     help = How many coefficients to be used in the the linesearch
     doc = choose between the 'quadratic' approximation (default) or 'all'
 
+    [wavefield_precond]
+    default = False
+    type = bool
+    help = Whether to use the wavefield preconditioner
+    doc = This parameter can give faster convergence.
+
+    [wavefield_delta_object]
+    default = 0.1
+    type = float
+    help = Wavefield preconditioner damping constant for the object.
+
+    [wavefield_delta_probe]
+    default = 0.1
+    type = float
+    help = Wavefield preconditioner damping constant for the probe.
+
     """
 
     SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull]
@@ -137,6 +153,9 @@ def __init__(self, ptycho_parent, pars=None):
         # Probe gradient
         self.pr_grad_new = None
 
+        # Object and probe fluence maps
+        self.ob_fln = None
+        self.pr_fln = None
 
         # Other
         self.tmin = None
@@ -172,6 +191,11 @@ def engine_initialize(self):
         self.pr_grad_new = self.pr.copy(self.pr.ID + '_grad_new', fill=0.)
         self.pr_h = self.pr.copy(self.pr.ID + '_h', fill=0.)
 
+        # Object and probe fluence maps
+        if self.p.wavefield_precond:
+            self.ob_fln = self.ob.copy(self.ob.ID + '_fln', fill=0., dtype='real')
+            self.pr_fln = self.pr.copy(self.pr.ID + '_fln', fill=0., dtype='real')
+
         self.tmin = 1.
 
         # Other options
@@ -230,6 +254,12 @@ def engine_iterate(self, num=1):
             else:
                 new_pr_grad.fill(0.)
 
+            # Wavefield preconditioner
+            if self.p.wavefield_precond:
+                for name, s in new_ob_grad.storages.items():
+                    new_ob_grad.storages[name].data /= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                    new_pr_grad.storages[name].data /= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+
             # Smoothing preconditioner
             if self.smooth_gradient:
                 self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -276,19 +306,35 @@ def engine_iterate(self, num=1):
             self.pr_grad << new_pr_grad
 
             dt = self.ptycho.FType
+
             # 3. Next conjugate
             self.ob_h *= bt / self.tmin
-
-            # Smoothing preconditioner
-            if self.smooth_gradient:
+            # Wavefield preconditioner for the object (with and without smoothing preconditioner)
+            if self.p.wavefield_precond:
                 for name, s in self.ob_h.storages.items():
-                    s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data)
+                    if self.smooth_gradient:
+                        s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data 
+                                      / np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object))
+                    else:
+                        s.data[:] -= (self.ob_grad.storages[name].data 
+                                      / np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)) 
             else:
-                self.ob_h -= self.ob_grad
+                # Smoothing preconditioner for the object
+                if self.smooth_gradient:
+                    for name, s in self.ob_h.storages.items():
+                        s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data)
+                else:
+                    self.ob_h -= self.ob_grad
 
             self.pr_h *= bt / self.tmin
             self.pr_grad *= self.scale_p_o
-            self.pr_h -= self.pr_grad
+            # Wavefield preconditioner for the probe
+            if self.p.wavefield_precond:
+                for name, s in self.pr_h.storages.items():
+                    s.data[:] -= (self.pr_grad.storages[name].data 
+                                  / np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe))
+            else:
+                self.pr_h -= self.pr_grad
 
             # In principle, the way things are now programmed this part
             # could be iterated over in a real Newton-Raphson style.
@@ -309,9 +355,9 @@ def engine_iterate(self, num=1):
                 self.tmin = dt(-0.5 * B[1] / B[2])
             else:
                 raise NotImplementedError("poly_line_coeffs should be 'quadratic' or 'all'")
-                
+
             tc += time.time() - t2
-                
+
             self.ob_h *= self.tmin
             self.pr_h *= self.tmin
             self.ob += self.ob_h
@@ -353,6 +399,11 @@ def engine_finalize(self):
         del self.pr_grad_new
         del self.ptycho.containers[self.pr_h.ID]
         del self.pr_h
+        if self.p.wavefield_precond:
+            del self.ptycho.containers[self.ob_fln.ID]
+            del self.ob_fln
+            del self.ptycho.containers[self.pr_fln.ID]
+            del self.pr_fln
 
         # Save floating intensities into runtime
         self.ptycho.runtime["float_intens"] = parallel.gather_dict(self.ML_model.float_intens_coeff)
@@ -377,6 +428,8 @@ def __init__(self, MLengine):
         self.ob = self.engine.ob
         self.ob_grad = self.engine.ob_grad_new
         self.pr_grad = self.engine.pr_grad_new
+        self.ob_fln = self.engine.ob_fln
+        self.pr_fln = self.engine.pr_fln
         self.pr = self.engine.pr
         self.float_intens_coeff = {}
 
@@ -490,6 +543,9 @@ def new_grad(self):
         """
         self.ob_grad.fill(0.)
         self.pr_grad.fill(0.)
+        if self.p.wavefield_precond:
+            self.ob_fln.fill(0.)
+            self.pr_fln.fill(0.)
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -531,6 +587,11 @@ def new_grad(self):
                 self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj()
                 self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj()
 
+                # Compute fluence maps for object and probe
+                if self.p.wavefield_precond:
+                    self.ob_fln[pod.ob_view] += u.abs2(pod.probe)
+                    self.pr_fln[pod.pr_view] += u.abs2(pod.object)
+
             diff_view.error = LLL
             error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0])
             LL += LLL
@@ -538,6 +599,9 @@ def new_grad(self):
         # MPI reduction of gradients
         self.ob_grad.allreduce()
         self.pr_grad.allreduce()
+        if self.p.wavefield_precond:
+            self.ob_fln.allreduce()
+            self.pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
@@ -733,6 +797,9 @@ def new_grad(self):
         """
         self.ob_grad.fill(0.)
         self.pr_grad.fill(0.)
+        if self.p.wavefield_precond:
+            self.ob_fln.fill(0.)
+            self.pr_fln.fill(0.)
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -774,6 +841,11 @@ def new_grad(self):
                 self.ob_grad[pod.ob_view] += 2 * xi * pod.probe.conj()
                 self.pr_grad[pod.pr_view] += 2 * xi * pod.object.conj()
 
+                # Compute fluence maps for object and probe
+                if self.p.wavefield_precond:
+                    self.ob_fln[pod.ob_view] += u.abs2(pod.probe)
+                    self.pr_fln[pod.pr_view] += u.abs2(pod.object)
+
             diff_view.error = LLL
             error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0])
             LL += LLL
@@ -781,6 +853,9 @@ def new_grad(self):
         # MPI reduction of gradients
         self.ob_grad.allreduce()
         self.pr_grad.allreduce()
+        if self.p.wavefield_precond:
+            self.ob_fln.allreduce()
+            self.pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
@@ -989,6 +1064,9 @@ def new_grad(self):
         """
         self.ob_grad.fill(0.)
         self.pr_grad.fill(0.)
+        if self.p.wavefield_precond:
+            self.ob_fln.fill(0.)
+            self.pr_fln.fill(0.)
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -1030,6 +1108,11 @@ def new_grad(self):
                 self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj()
                 self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj()
 
+                # Compute fluence maps for object and probe
+                if self.p.wavefield_precond:
+                    self.ob_fln[pod.ob_view] += u.abs2(pod.probe)
+                    self.pr_fln[pod.pr_view] += u.abs2(pod.object)
+
             diff_view.error = LLL
             error_dct[dname] = np.array([0, LLL / np.prod(DA.shape), 0])
             LL += LLL
@@ -1037,6 +1120,9 @@ def new_grad(self):
         # MPI reduction of gradients
         self.ob_grad.allreduce()
         self.pr_grad.allreduce()
+        if self.p.wavefield_precond:
+            self.ob_fln.allreduce()
+            self.pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
@@ -1290,7 +1376,7 @@ def __init__(self, sigma):
             self.sigma = sigma
 
         def __call__(self, x):
-            return u.c_gf(x, [0, self.sigma, self.sigma])
+            return  u.c_gf(x, [0, self.sigma, self.sigma])
 
     # from scipy.signal import correlate2d
     # class HannFilt:
diff --git a/ptypy/resources/__init__.py b/ptypy/resources/__init__.py
index 50bab0b5e..becdece26 100644
--- a/ptypy/resources/__init__.py
+++ b/ptypy/resources/__init__.py
@@ -1,15 +1,17 @@
-import pkg_resources
+from importlib.resources import files
 import numpy as np
 
-flowerfile = pkg_resources.resource_filename(__name__,'flowers.png')
-moonfile = pkg_resources.resource_filename(__name__,'moon.png')
-treefile = pkg_resources.resource_filename(__name__,'tree.png')
+_res_dir = files('ptypy.resources')
+flowerfile =  _res_dir / 'flowers.png'
+moonfile = _res_dir / 'moon.png'
+treefile = _res_dir / 'tree.png'
+
 
 def flower_obj(shape=None):
     from ptypy import utils as u
     import numpy as np
     from matplotlib.image import imread
-    
+
     im = u.rgb2complex(imread(flowerfile))
     if shape is not None:
         sh = u.expect2(shape)
@@ -37,7 +39,7 @@ def moon_pr(shape=None):
     from ptypy import utils as u
     import numpy as np
     from matplotlib.image import imread
-    
+
     im = u.rgb2complex(imread(moonfile))
     if shape is not None:
         sh = u.expect2(shape)
diff --git a/ptypy/utils/plot_utils.py b/ptypy/utils/plot_utils.py
index 2f85e72b5..a4a81a596 100644
--- a/ptypy/utils/plot_utils.py
+++ b/ptypy/utils/plot_utils.py
@@ -128,7 +128,7 @@ def pause(timeout=-1, message=None):
                     print(message)
                 time.sleep(timeout)
 
-# BD: With version 9.1.0 of PIL, _MODE_CONV has been removed, 
+# BD: With version 9.1.0 of PIL, _MODE_CONV has been removed,
 #     see here: https://github.com/python-pillow/Pillow/pull/6057
 #     can't see a reason why this is still needed, therefore commenting it out
 # FIXME: Is this still needed?
@@ -179,7 +179,7 @@ def complex2hsv(cin, vmin=0., vmax=None):
     else:
         assert vmin < vmax
         v = (v.clip(vmin, vmax)-vmin)/(vmax-vmin)
-    
+
     return np.asarray((h, s, v))
 
 
@@ -345,7 +345,7 @@ def imsave(a, filename=None, vmin=None, vmax=None, cmap=None):
     uses a matplotlib colormap with name 'gray'
     """
     if str(cmap) == cmap:
-        cmap = mpl.colormaps.get_cmap(cmap)
+        cmap = mpl.colormaps[cmap]
 
     if a.dtype.kind == 'c':
         i = complex2rgb(a, vmin=vmin, vmax=vmax)
@@ -415,8 +415,12 @@ def imload(filename):
                                                       (0.340,   1,    1),
                                                       (0.650,   0,    0),
                                                       (1.000,   0,    0))}
-                                                      
-mpl.colormaps.register(cmap=LinearSegmentedColormap(name='franzmap', segmentdata=franzmap_cm, N=256))
+
+_lscm = LinearSegmentedColormap(name='franzmap', segmentdata=franzmap_cm, N=256)
+try:
+    mpl.colormaps.register(cmap=_lscm)
+except AttributeError:
+    matplotlib.cm.register_cmap(cmap=_lscm)
 
 def franzmap():
     """\
@@ -426,7 +430,7 @@ def franzmap():
     im = mpl.pyplot.gci()
 
     if im is not None:
-        im.set_cmap(matplotlib.cm.get_cmap('franzmap'))
+        im.set_cmap(mpl.colormaps['franzmap'])
     mpl.pyplot.draw_if_interactive()
 
 
@@ -595,46 +599,46 @@ class PtyAxis(object):
     """
     Plot environment for matplotlib to allow for a Image plot with color axis,
     potentially of a potentially complex array.
-    
+
     Please note that this class may undergo changes or become obsolete altogether.
     """
     def __init__(self, ax=None, data=None, channel='r', cmap=None, fontsize=8, **kwargs):
         """
-        
+
         Parameters
         ----------
-        
+
         ax : pyplot.axis
             An axis in matplotlib figure. If ``None`` a figure with a single
             axis will be created.
-            
+
         data : numpy.ndarray
-            The (complex) twodimensional data to be displayed. 
-            
+            The (complex) twodimensional data to be displayed.
+
         channel : str
             Choose
-            
+
             - ``'a'`` to plot absolute/modulus value of the data,
             - ``'p'`` to plot phase value of the data,
             - ``'a'`` to plot real value of the data,
             - ``'a'`` to plot imaginary value of the data,
-            - ``'a'`` to plot a composite image where phase channel maps to hue and 
+            - ``'a'`` to plot a composite image where phase channel maps to hue and
               modulus channel maps to brightness of the color.
-              
-        cmap : str 
+
+        cmap : str
             String representation of one of matplotlibs colormaps.
-            
+
         fontsize : int
             Base font size of labels, etc.
-            
+
         Keyword Arguments
         -----------------
         vmin, vmax : float
             Minimum and maximum value for colormap scaling
-            
+
         rmramp : bool
             Remove phase ramp if ``True``, default is ``False``
-            
+
         """
         if ax is None:
             fig = plt.figure()
@@ -679,10 +683,10 @@ def set_channel(self, channel, update=True):
 
     def set_cmap(self, cmap, update=True):
         try:
-            self.cmap = mpl.colormaps.get_cmap(cmap)
+            self.cmap = mpl.colormaps[cmap]
         except:
             logger.debug("Colormap `%s` not found. Using `gray`" % str(cmap))
-            self.cmap = mpl.colormaps.get_cmap('gray')
+            self.cmap = mpl.colormaps['gray']
         if update:
             self._update()
             self._update_colorscale()
diff --git a/ptypy/version.py b/ptypy/version.py
index 16317abfa..0ae9661c4 100644
--- a/ptypy/version.py
+++ b/ptypy/version.py
@@ -1,6 +1,6 @@
 
-short_version = '0.8.2'
-version = '0.8.2'
+short_version = '0.9.0'
+version = '0.9.0'
 release = False
 
 if not release:
diff --git a/pyproject.toml b/pyproject.toml
index 4b35cf554..3f4b0c9b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ authors = [
 ]
 description = "Ptychography Reconstruction for Python"
 readme = "README.rst"
-requires-python = ">=3.7"
+requires-python = ">=3.9"
 classifiers = [
     "Programming Language :: Python :: 3",
     "Development Status :: 3 - Alpha",
@@ -31,7 +31,7 @@ dependencies = [
 [project.optional-dependencies]
 full = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "pyyaml"]
 cupy-cuda11x = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "cupy-cuda11x"]
-cupy-cuda12x = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "cupy-cuda12x"] 
+cupy-cuda12x = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "cupy-cuda12x"]
 
 [project.scripts]
 "ptypy.plot" = "ptypy.cli.plotter:ptypy_plot"
@@ -55,6 +55,7 @@ testpaths = [
   "test/ptyscan_tests",
   "test/template_tests",
   "test/util_tests",
+  "test/accelerate_tests/base_tests"
 ]
 
 # this is all BETA according to setuptools
@@ -69,4 +70,4 @@ ptypy = "ptypy"
 
 [tool.setuptools.package-data]
 ptypy = ["resources/*",]
-"ptypy.accelerate.cuda_common" = ["*.cu", "*.cuh"]
\ No newline at end of file
+"ptypy.accelerate.cuda_common" = ["*.cu", "*.cuh"]
diff --git a/release_notes.md b/release_notes.md
index bed47e241..489885686 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -1,3 +1,29 @@
+# PtyPy 0.9 release notes
+
+We're excited to bring you a new release in preparation for the [PtyPy workshop 2025](https://www.synchrotron-soleil.fr/en/events/ptypy-2025)
+held at SOLEIL. This release provides a new pre-conditioner for the ML engine and a few bug fixes.
+
+## ML with wavefield pre-conditioner
+
+We have added a new pre-conditioner to the ML engines, which seems to significantly improve probe retrieval
+and overall convergence. It can be enabled by setting the following parameters
+
+```
+p.engines.engine.name = "ML"
+p.engines.engine.wavefield_precond = True
+p.engines.engine.wavefield_delta_object = 0.1
+p.engines.engine.wavefield_delta_probe = 0.1
+```
+
+where ```wavefield_delta_object``` and ```wavefield_delta_probe``` are regularisation parameters to avoid
+division by zero. 
+
+## Other changes
+
+We have dropped support for Python <= 3.8 and adopted Numpy >= 2.0.
+We have also fixed an issue with the filtered_fft build and made small improvements to the custom WASP engine.
+
+
 # PtyPy 0.8 release notes
 
 We're excited to bring you a new release, with new engines, CuPy support and
@@ -43,9 +69,8 @@ Patch release.
 
 # PtyPy 0.7 release notes
 
-This release is focused on improving the usability of PtyPy in Jupyter notebooks in preparation for the 
-[PtyPy workshop](https://www.diamond.ac.uk/Home/Events/2023/Ptychography--PtyPy--Software-Workshop-2023.html) 
-held at the Diamond Light Source in January 2023. The workshop features extensive interactive 
+This release is focused on improving the usability of PtyPy in Jupyter notebooks in preparation for the first 
+PtyPy workshop held at the Diamond Light Source in January 2023. The workshop features extensive interactive 
 [tutorials](https://ptycho.github.io/tutorials) delivered using Jupyter notebooks. 
 
 ## Build changes
diff --git a/templates/engines/cupy/moonflower_ML_cupy.py b/templates/engines/cupy/moonflower_ML_cupy.py
index af0427cfb..61845bdd3 100644
--- a/templates/engines/cupy/moonflower_ML_cupy.py
+++ b/templates/engines/cupy/moonflower_ML_cupy.py
@@ -16,7 +16,7 @@
 
 # for verbose output
 p.verbose_level = "info"
-p.frames_per_block = 400
+p.frames_per_block = 100
 # set home path
 p.io = u.Param()
 p.io.home =  "/".join([tmpdir, "ptypy"])
@@ -33,7 +33,7 @@
 p.scans.MF.data= u.Param()
 p.scans.MF.data.name = 'MoonFlowerScan'
 p.scans.MF.data.shape = 128
-p.scans.MF.data.num_frames = 100
+p.scans.MF.data.num_frames = 200
 p.scans.MF.data.save = None
 
 p.scans.MF.illumination = u.Param(diversity=None)
diff --git a/templates/engines/cupy/moonflower_ML_cupy_wavefield.py b/templates/engines/cupy/moonflower_ML_cupy_wavefield.py
new file mode 100644
index 000000000..0e2d896b2
--- /dev/null
+++ b/templates/engines/cupy/moonflower_ML_cupy_wavefield.py
@@ -0,0 +1,58 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 100
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_cupy'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/numpy/moonflower_ML_Gaussian_wavefield.py b/templates/engines/numpy/moonflower_ML_Gaussian_wavefield.py
new file mode 100644
index 000000000..b5f409f68
--- /dev/null
+++ b/templates/engines/numpy/moonflower_ML_Gaussian_wavefield.py
@@ -0,0 +1,56 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+#import ptypy
+from ptypy.core import Ptycho
+from ptypy import utils as u
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+
+# saving intermediate results
+p.io.autosave = u.Param(active=False)
+
+# opens plotting GUI if interaction set to active)
+p.io.autoplot = u.Param(active=True)
+p.io.interaction = u.Param(active=True)
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML'
+p.engines.engine00.ML_type = 'Gaussian'
+p.engines.engine00.numiter = 300
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/pycuda/moonflower_ML_pycuda.py b/templates/engines/pycuda/moonflower_ML_pycuda.py
index 2938275f7..8afcf3517 100644
--- a/templates/engines/pycuda/moonflower_ML_pycuda.py
+++ b/templates/engines/pycuda/moonflower_ML_pycuda.py
@@ -16,7 +16,7 @@
 
 # for verbose output
 p.verbose_level = "info"
-p.frames_per_block = 400
+p.frames_per_block = 100
 # set home path
 p.io = u.Param()
 p.io.home =  "/".join([tmpdir, "ptypy"])
diff --git a/templates/engines/pycuda/moonflower_ML_pycuda_wavefield.py b/templates/engines/pycuda/moonflower_ML_pycuda_wavefield.py
new file mode 100644
index 000000000..04c0528f3
--- /dev/null
+++ b/templates/engines/pycuda/moonflower_ML_pycuda_wavefield.py
@@ -0,0 +1,58 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="pycuda")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 100
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_pycuda'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/serial/moonflower_ML_serial_wavefield.py b/templates/engines/serial/moonflower_ML_serial_wavefield.py
new file mode 100644
index 000000000..3d2d2dc84
--- /dev/null
+++ b/templates/engines/serial/moonflower_ML_serial_wavefield.py
@@ -0,0 +1,59 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+
+import ptypy
+ptypy.load_gpu_engines(arch="serial")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 100
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_serial'
+p.engines.engine00.ML_type = 'Gaussian'
+p.engines.engine00.numiter = 300
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/test/accelerate_tests/base_tests/WASP_tests.py b/test/accelerate_tests/base_tests/WASP_test.py
similarity index 89%
rename from test/accelerate_tests/base_tests/WASP_tests.py
rename to test/accelerate_tests/base_tests/WASP_test.py
index a0ea4514e..5cf73982c 100644
--- a/test/accelerate_tests/base_tests/WASP_tests.py
+++ b/test/accelerate_tests/base_tests/WASP_test.py
@@ -99,6 +99,7 @@ def test_WASP_serial_base(self):
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.random_seed = 721
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
@@ -120,12 +121,28 @@ def test_WASP_serial_clip(self):
         if parallel.master:
             self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_WASP_serial_probe_start(self):
+        out = []
+        for eng in ["WASP", "WASP_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.probe_update_start = 4
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
     def test_WASP_serial_alpha_beta(self):
         out = []
         for eng in ["WASP", "WASP_serial"]:
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.alpha = 0.64
             engine_params.beta = 0.94
             engine_params.random_seed = 721
diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py
index 364352a5c..4d8081903 100644
--- a/test/accelerate_tests/base_tests/array_utils_test.py
+++ b/test/accelerate_tests/base_tests/array_utils_test.py
@@ -2,11 +2,12 @@
 Tests for the array_utils module
 '''
 
-import unittest
+import unittest, pytest
 import numpy as np
 from ptypy.accelerate.base import FLOAT_TYPE, COMPLEX_TYPE
 from ptypy.accelerate.base import array_utils as au
 
+_numpy_version = tuple(map(int, np.__version__.split(".")))
 
 class ArrayUtilsTest(unittest.TestCase):
 
@@ -291,6 +292,7 @@ def test_crop_pad_3(self):
                          dtype=np.complex64)
         np.testing.assert_array_almost_equal(A, exp_A)
 
+    @pytest.mark.skipif(_numpy_version < (2,0), reason="requires Numpy 2.0 or higher to match expected output")
     def test_fft_filter(self):
         data = np.zeros((256, 512), dtype=COMPLEX_TYPE)
         data[64:-64,128:-128] = 1 + 1.j
@@ -305,43 +307,44 @@ def test_fft_filter(self):
         kernel = np.fft.fftn(rk)
 
         output = au.fft_filter(data, kernel, prefactor, postfactor)
+
+        known_test_output = np.array([-0.0000000e+00+0.00000000e+00j, -0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00-0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      -0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      -0.0000000e+00+0.00000000e+00j, -0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00-0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00-0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  6.1097220e-05+2.92563982e-05j,
+                                      4.0044695e-05+2.52102855e-05j,  8.9999994e+02+9.00000000e+02j,
+                                      8.9999988e+02+8.99999939e+02j,  5.0999994e+02+5.09999939e+02j,
+                                      1.9365043e-05+5.84280206e-05j,  3.0681291e-05+2.31116355e-05j,
+                                      1.2552022e-05-1.01537153e-05j, -1.4034913e-05-1.17988075e-05j,
+                                      -1.9193330e-05+2.72889110e-07j,  1.3895768e-05+1.64778357e-05j,
+                                      6.5228807e-05+2.45708943e-05j,  3.8999994e+02+3.89999939e+02j,
+                                      8.9999988e+02+8.99999878e+02j,  8.9999982e+02+8.99999878e+02j,
+                                      3.0000015e+01+3.00000248e+01j,  3.8863189e-05+3.26705631e-05j,
+                                      2.8768281e-06-1.62116921e-05j, -3.2418033e-05-1.97073969e-05j,
+                                      -6.6843757e-05+7.19546824e-06j,  6.5036993e-06+3.95851657e-06j,
+                                      -2.4053887e-05+9.88548163e-06j,  1.5231475e-05+1.31202614e-06j,
+                                      8.7000000e+01+8.70000305e+01j,  6.1035156e-05+0.00000000e+00j,
+                                      6.1035156e-05+0.00000000e+00j, -2.4943074e-07+6.62429193e-06j,
+                                      1.6712515e-06-2.97475322e-06j,  1.9025241e-05+2.97752194e-07j,
+                                      -9.2436176e-07-3.86252796e-05j, -8.8145862e-06-9.89961700e-06j,
+                                      -1.5782407e-06+1.01533060e-05j, -4.7593076e-06+2.96332291e-05j])
         
-        known_test_output =  np.array([-0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                       -0.00000000e+00 + 0.00000000e+00j, 0.00000000e+00 - 0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  8.66422277e-14+4.86768828e-14j,
-                                      7.23113320e-14+2.82331542e-14j,  9.00000000e+02+9.00000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  5.10000000e+02+5.10000000e+02j,
-                                      1.41172830e-14+3.62223425e-14j,  2.61684238e-14-4.13866575e-14j,
-                                      2.16691314e-14-1.95102733e-14j, -1.36536942e-13-9.94589021e-14j,
-                                     -1.42905371e-13-5.77964697e-14j, -5.00005072e-14+4.08620637e-14j,
-                                      6.38160272e-14+7.61753583e-14j,  3.90000000e+02+3.90000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  9.00000000e+02+9.00000000e+02j,
-                                      3.00000000e+01+3.00000000e+01j,  8.63255773e-14+7.08532924e-14j,
-                                      1.80941313e-14-3.85517154e-14j,  7.84277340e-14-1.32008745e-14j,
-                                     -6.57025196e-14-1.72739350e-14j, -6.69570857e-15+6.49622898e-14j,
-                                      6.27436466e-15+7.57162569e-14j,  2.01150157e-15+3.65538558e-14j,
-                                      8.70000000e+01+8.70000000e+01j, -1.13686838e-13-1.70530257e-13j,
-                                      0.00000000e+00-2.27373675e-13j, -1.84492121e-14-9.21502853e-14j,
-                                      2.12418687e-14-8.62209232e-14j,  1.20880692e-13+3.86522371e-14j,
-                                      1.03754734e-13+9.19851759e-14j,  5.50926123e-14+1.17150422e-13j,
-                                     -5.47869215e-14+5.87176511e-14j, -3.52652980e-14+8.44455504e-15j])
-
-        np.testing.assert_array_almost_equal(output.flat[::2000], known_test_output)
+        np.testing.assert_array_almost_equal(output.flat[::2000], known_test_output, decimal=5)
 
+    @pytest.mark.skipif(_numpy_version < (2,0), reason="requires Numpy 2.0 or higher to match expected output")
     def test_fft_filter_batched(self):
         data = np.zeros((2,256, 512), dtype=COMPLEX_TYPE)
         data[:,64:-64,128:-128] = 1 + 1.j
@@ -356,42 +359,42 @@ def test_fft_filter_batched(self):
         kernel = np.fft.fftn(rk)
 
         output = au.fft_filter(data, kernel, prefactor, postfactor)
-        
-        known_test_output =  np.array([-0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                       -0.00000000e+00 + 0.00000000e+00j, 0.00000000e+00 - 0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  8.66422277e-14+4.86768828e-14j,
-                                      7.23113320e-14+2.82331542e-14j,  9.00000000e+02+9.00000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  5.10000000e+02+5.10000000e+02j,
-                                      1.41172830e-14+3.62223425e-14j,  2.61684238e-14-4.13866575e-14j,
-                                      2.16691314e-14-1.95102733e-14j, -1.36536942e-13-9.94589021e-14j,
-                                     -1.42905371e-13-5.77964697e-14j, -5.00005072e-14+4.08620637e-14j,
-                                      6.38160272e-14+7.61753583e-14j,  3.90000000e+02+3.90000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  9.00000000e+02+9.00000000e+02j,
-                                      3.00000000e+01+3.00000000e+01j,  8.63255773e-14+7.08532924e-14j,
-                                      1.80941313e-14-3.85517154e-14j,  7.84277340e-14-1.32008745e-14j,
-                                     -6.57025196e-14-1.72739350e-14j, -6.69570857e-15+6.49622898e-14j,
-                                      6.27436466e-15+7.57162569e-14j,  2.01150157e-15+3.65538558e-14j,
-                                      8.70000000e+01+8.70000000e+01j, -1.13686838e-13-1.70530257e-13j,
-                                      0.00000000e+00-2.27373675e-13j, -1.84492121e-14-9.21502853e-14j,
-                                      2.12418687e-14-8.62209232e-14j,  1.20880692e-13+3.86522371e-14j,
-                                      1.03754734e-13+9.19851759e-14j,  5.50926123e-14+1.17150422e-13j,
-                                     -5.47869215e-14+5.87176511e-14j, -3.52652980e-14+8.44455504e-15j])
-
-        np.testing.assert_array_almost_equal(output[1].flat[::2000], known_test_output)
+
+        known_test_output = np.array([ 0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j, 0.00000000e+00+0.0000000e+00j,
+                                       -0.00000000e+00+0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j, 0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j, 0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  4.86995195e-05-9.1511911e-06j,
+                                       5.89395277e-05+3.6706428e-05j,  8.99999817e+02+9.0000000e+02j,
+                                       8.99999817e+02+9.0000000e+02j,  5.09999969e+02+5.0999997e+02j,
+                                       6.86399580e-05+5.5245564e-05j, -2.15578075e-06-8.0761157e-07j,
+                                       -5.99612467e-05-3.7489859e-05j, -2.08058154e-05-1.7001423e-05j,
+                                       -3.15661709e-05-2.0192698e-05j, -1.17410173e-05-2.3929812e-05j,
+                                       8.41844594e-05+4.9635066e-05j,  3.90000031e+02+3.9000003e+02j,
+                                       8.99999817e+02+8.9999994e+02j,  8.99999817e+02+8.9999994e+02j,
+                                       3.00000153e+01+3.0000000e+01j,  4.75842753e-05+1.7961407e-05j,
+                                       -1.28229876e-05-3.3492659e-05j, -1.50405585e-05+3.0159079e-05j,
+                                       -1.00799960e-04-6.6932058e-05j, -4.90295024e-05-3.6601130e-05j,
+                                       -4.48861247e-05-1.4717044e-05j,  2.60417364e-05-8.3221821e-06j,
+                                       8.69999847e+01+8.7000046e+01j,  4.31583721e-05+4.3158372e-05j,
+                                       4.31583721e-05+4.3158372e-05j,  4.04649109e-06-1.6836095e-05j,
+                                       1.37377283e-05+5.2577798e-06j, -2.30404657e-05-3.4596611e-05j,
+                                       -1.33214944e-05-3.2517899e-05j,  2.45428764e-05-3.5186855e-07j,
+                                       -1.85950885e-05-2.1921931e-05j, -1.65030433e-05-8.0249208e-07j])
+                        
+        np.testing.assert_array_almost_equal(output[1].flat[::2000], known_test_output, decimal=5)
 
 
     def test_complex_gaussian_filter_fft(self):
diff --git a/test/accelerate_tests/base_tests/engine_tests.py b/test/accelerate_tests/base_tests/engine_test.py
similarity index 74%
rename from test/accelerate_tests/base_tests/engine_tests.py
rename to test/accelerate_tests/base_tests/engine_test.py
index dbfc1e9f6..3177906fe 100644
--- a/test/accelerate_tests/base_tests/engine_tests.py
+++ b/test/accelerate_tests/base_tests/engine_test.py
@@ -14,6 +14,7 @@
 import tempfile
 import shutil
 import numpy as np
+import pytest
 
 class MLSerialTest(unittest.TestCase):
 
@@ -23,7 +24,7 @@ def setUp(self):
     def tearDown(self):
         shutil.rmtree(self.outpath)
 
-    def check_engine_output(self, output, plotting=False, debug=False):
+    def check_engine_output(self, output, plotting=False, debug=False, tol=0.1):
         P_ML, P_ML_serial = output
         numiter = len(P_ML.runtime["iter_info"])
         LL_ML = np.array([P_ML.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
@@ -33,6 +34,11 @@ def check_engine_output(self, output, plotting=False, debug=False):
         PRB_ML_serial, PRB_ML = P_ML_serial.probe.S["SMFG00"].data[0], P_ML.probe.S["SMFG00"].data[0]
         eng_ML = P_ML.engines["engine00"]
         eng_ML_serial = P_ML_serial.engines["engine00"]
+        # Normalize the outputs
+        PRB_ML_max = np.abs(PRB_ML).max()
+        PRB_ML_serial_max = np.abs(PRB_ML_serial).max()
+        OBJ_ML_serial *= (PRB_ML_serial_max / PRB_ML_max)
+        PRB_ML_serial /= (PRB_ML_serial_max / PRB_ML_max)
         if debug:
             import matplotlib.pyplot as plt
             plt.figure("ML debug")
@@ -66,15 +72,15 @@ def check_engine_output(self, output, plotting=False, debug=False):
         # np.testing.assert_allclose(eng_ML.debug, eng_ML_serial.debug, atol=1e-7, rtol=1e-7,
         #                             err_msg="The debug arrays are not matching as expected")
         RMSE_ob = (np.mean(np.abs(OBJ_ML_serial - OBJ_ML)**2))
-        RMSE_pr = (np.mean(np.abs(PRB_ML_serial - PRB_ML)**2))
-        # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2, 
+        RMSE_pr = (np.mean(np.abs(PRB_ML_serial - PRB_ML)**2))        
+        #MSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol, 
                                     err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2, 
-                                    err_msg="The object arrays are not matching as expected")
-        # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
-        #                             err_msg="The log-likelihood errors are not matching as expected")
-    
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol, 
+                                    err_msg="The probe arrays are not matching as expected")
+        #p.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
+        #                           err_msg="The log-likelihood errors are not matching as expected")
+
 
     def test_ML_serial_base(self):
         out = []
@@ -87,7 +93,7 @@ def test_ML_serial_base(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_serial_regularizer(self):
@@ -101,7 +107,7 @@ def test_ML_serial_regularizer(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
 
@@ -117,7 +123,7 @@ def test_ML_serial_preconditioner(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_serial_floating(self):
@@ -131,9 +137,10 @@ def test_ML_serial_floating(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
+    @pytest.mark.skip(reason="Funny behaviour with this test, most likely related to Gaussian filter, see issue #607")
     def test_ML_serial_smoothing_regularizer(self):
         out = []
         for eng in ["ML", "ML_serial"]:
@@ -147,9 +154,29 @@ def test_ML_serial_smoothing_regularizer(self):
             engine_params.smooth_gradient_decay = 1/10.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
+    @pytest.mark.skip(reason="Funny behaviour with this test, most likely related to Gaussian filter, see issue #607")
+    def test_ML_serial_wavefield_preconditioner(self):
+        out = []
+        for eng in ["ML", "ML_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 500
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0.
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
+
     def test_ML_serial_all(self):
         out = []
         for eng in ["ML", "ML_serial"]:
diff --git a/test/accelerate_tests/base_tests/po_update_kernel_test.py b/test/accelerate_tests/base_tests/po_update_kernel_test.py
index 953c26b54..9b3ca5c1b 100644
--- a/test/accelerate_tests/base_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/base_tests/po_update_kernel_test.py
@@ -188,7 +188,7 @@ def test_pr_update(self):
                                       err_msg="The probe denominatorhas not been updated as expected")
 
     def test_pr_update_ML(self):
-        # setup  
+        # setup
         addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
 
         # test
@@ -242,6 +242,100 @@ def test_ob_update_ML(self):
         np.testing.assert_array_equal(object_array, expected_object_array,
                                       err_msg="The object array has not been updated as expected")
 
+    def test_pr_update_ML_wavefield(self):
+        # setup
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        probe_wavefield = np.zeros_like(probe, dtype=FLOAT_TYPE)
+
+        # test
+        POUK = PoUpdateKernel()
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+        POUK.pr_update_ML_wavefield(addr, probe, probe_wavefield, object_array, exit_wave)
+
+        # assert
+        expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]],
+
+                                   [[786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]],
+                                  dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(probe, expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+        # assert
+        expected_probe_wavefield = np.array(
+              [[[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]],
+
+               [[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]]], 
+                dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(probe_wavefield, expected_probe_wavefield,
+                                      err_msg="The probe wavefield has not been updated as expected")
+
+    def test_ob_update_ML_wavefield(self):
+        # setup
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        object_array_wavefield = np.zeros_like(object_array, dtype=FLOAT_TYPE)
+        
+        # test
+        POUK = PoUpdateKernel()
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+        POUK.ob_update_ML_wavefield(addr, object_array, object_array_wavefield, probe, exit_wave)
+
+        # assert
+        expected_object_array = np.array(
+            [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j],
+              [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]],
+
+             [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j],
+              [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]],
+            dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(object_array, expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+        # assert
+        expected_object_array_wavefield = np.array(
+          [[[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+
+           [[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]], 
+        dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(object_array_wavefield, expected_object_array_wavefield,
+                                      err_msg="The object array wavefield has not been updated as expected")
 
     def test_pr_update_local(self):
         # setup
diff --git a/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py b/test/accelerate_tests/cuda_cupy_tests/WASP_test.py
similarity index 89%
rename from test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
rename to test/accelerate_tests/cuda_cupy_tests/WASP_test.py
index 7ccda4930..91bc0a1fc 100644
--- a/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
+++ b/test/accelerate_tests/cuda_cupy_tests/WASP_test.py
@@ -98,6 +98,7 @@ def test_WASP_cupy_base(self):
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.random_seed = 721
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
@@ -119,12 +120,28 @@ def test_WASP_cupy_clip(self):
         if parallel.master:
             self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_WASP_cupy_probe_start(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.probe_update_start = 4
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
     def test_WASP_cupy_alpha_beta(self):
         out = []
         for eng in ["WASP_serial", "WASP_cupy"]:
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.alpha = 0.64
             engine_params.beta = 0.94
             engine_params.random_seed = 721
diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py b/test/accelerate_tests/cuda_cupy_tests/engine_test.py
similarity index 80%
rename from test/accelerate_tests/cuda_cupy_tests/engine_tests.py
rename to test/accelerate_tests/cuda_cupy_tests/engine_test.py
index fe70b58bc..2eae5c545 100644
--- a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py
+++ b/test/accelerate_tests/cuda_cupy_tests/engine_test.py
@@ -23,7 +23,7 @@ def setUp(self):
     def tearDown(self):
         shutil.rmtree(self.outpath)
 
-    def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
+    def check_engine_output(self, output, plotting=False, debug=False, scan="MF", tol=0.1):
         key = "S%sG00" %scan
         P_ML_serial, P_ML_cupy = output
         numiter = len(P_ML_serial.runtime["iter_info"])
@@ -36,6 +36,11 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         MED_ML_cupy = np.median(np.angle(OBJ_ML_cupy))
         eng_ML_serial = P_ML_serial.engines["engine00"]
         eng_ML_cupy = P_ML_cupy.engines["engine00"]
+        # Normalize the outputs
+        PRB_ML_serial_max = np.abs(PRB_ML_serial).max()
+        PRB_ML_cupy_max = np.abs(PRB_ML_cupy).max()
+        OBJ_ML_cupy *= (PRB_ML_cupy_max / PRB_ML_serial_max)
+        PRB_ML_cupy /= (PRB_ML_cupy_max / PRB_ML_serial_max)
         if debug:
             import matplotlib.pyplot as plt
             plt.figure("ML serial debug")
@@ -71,13 +76,13 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         RMSE_ob = (np.mean(np.abs(OBJ_ML_cupy - OBJ_ML_serial)**2))
         RMSE_pr = (np.mean(np.abs(PRB_ML_cupy - PRB_ML_serial)**2))
         # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2, 
-                                    err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol,
                                     err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol,
+                                    err_msg="The probe arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
-    
+
     def test_ML_cupy_base(self):
         out = []
         for eng in ["ML_serial", "ML_cupy"]:
@@ -89,7 +94,7 @@ def test_ML_cupy_base(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_regularizer(self):
@@ -103,7 +108,7 @@ def test_ML_cupy_regularizer(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_preconditioner(self):
@@ -118,7 +123,7 @@ def test_ML_cupy_preconditioner(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_floating(self):
@@ -132,7 +137,7 @@ def test_ML_cupy_floating(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_smoothing_regularizer(self):
@@ -148,9 +153,28 @@ def test_ML_cupy_smoothing_regularizer(self):
             engine_params.smooth_gradient_decay = 1/10.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_ML_cupy_wavefield_preconditioner(self):
+        out = []
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 500
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
+
     def test_ML_cupy_all(self):
         out = []
         for eng in ["ML_serial", "ML_cupy"]:
@@ -165,7 +189,7 @@ def test_ML_cupy_all(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="info"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="info", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
 if __name__ == "__main__":
diff --git a/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
index 0c234d878..424c6696b 100644
--- a/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
@@ -11,7 +11,7 @@
     from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu
     from ptypy.utils import parallel
 
-from pkg_resources import parse_version
+from packaging.version import parse
 
 class GpuDataTest(CupyCudaTest):
     """
@@ -36,8 +36,8 @@ def setUp(self):
 
     @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0")
     def test_version(self):
-        v1 = parse_version("3.1.0")
-        v2 = parse_version(parse_version("3.1.0a").base_version)
+        v1 = parse("3.1.0")
+        v2 = parse(parse("3.1.0a").base_version)
 
         self.assertGreaterEqual(v2, v1)
 
@@ -61,7 +61,7 @@ def multigpu_tester(self, com):
 
     def test_multigpu_auto(self):
         self.multigpu_tester(mgpu.get_multi_gpu_communicator())
-        
+
     def test_multigpu_mpi(self):
         self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi())
 
@@ -71,4 +71,4 @@ def test_multigpu_cudampi(self):
 
     @unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
     def test_multigpu_nccl(self):
-        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
\ No newline at end of file
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
diff --git a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
index 5b2751877..bad8da21b 100644
--- a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
@@ -648,6 +648,127 @@ def test_ob_update_ML_atomics_REGRESSION(self):
     def test_ob_update_ML_tiled_REGRESSION(self):
         self.ob_update_ML_tester(False)
 
+    def pr_update_ML_wavefield_tester(self, atomics=False):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        probe_wavefield = cp.asarray(np.zeros_like(probe, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = cp.asarray(addr2)
+
+        POUK.pr_update_ML_wavefield(addr, probe, probe_wavefield, object_array, exit_wave, atomics=atomics)
+
+        expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]],
+
+                                   [[786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]],
+                                  dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(probe.get(), expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+        expected_probe_wavefield = np.array(
+              [[[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]],
+
+               [[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]]], 
+                dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(probe_wavefield.get(), expected_probe_wavefield,
+                                      err_msg="The probe wavefield has not been updated as expected")
+
+    def test_pr_update_ML_wavefield_atomics_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(True)
+
+    def test_pr_update_ML_wavefield_tiled_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(False)
+
+    def ob_update_ML_wavefield_tester(self, atomics=True):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        object_array_wavefield = cp.asarray(np.zeros_like(object_array, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = cp.asarray(addr2)
+
+        POUK.ob_update_ML_wavefield(addr, object_array, object_array_wavefield, probe, exit_wave, atomics=atomics)
+
+        expected_object_array = np.array(
+            [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j],
+              [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]],
+
+             [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j],
+              [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]],
+            dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(object_array.get(), expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+        expected_object_array_wavefield = np.array(
+          [[[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+
+           [[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]], 
+            dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(object_array_wavefield.get(), expected_object_array_wavefield,
+                                      err_msg="The object array wavefield has not been updated as expected")
+
+    def test_ob_update_ML_wavefield_atomics_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(True)
+
+    def test_ob_update_ML_wavefield_tiled_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(False)
+
     def test_ob_update_local_UNITY(self):
         '''
         setup
diff --git a/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py b/test/accelerate_tests/cuda_pycuda_tests/WASP_test.py
similarity index 89%
rename from test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
rename to test/accelerate_tests/cuda_pycuda_tests/WASP_test.py
index f8bf5f0c8..e8527481d 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/WASP_test.py
@@ -98,6 +98,7 @@ def test_WASP_pycuda_base(self):
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.random_seed = 721
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
@@ -119,12 +120,28 @@ def test_WASP_pycuda_clip(self):
         if parallel.master:
             self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_WASP_pycuda_probe_start(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.probe_update_start = 4
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
     def test_WASP_pycuda_alpha_beta(self):
         out = []
         for eng in ["WASP_serial", "WASP_pycuda"]:
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.alpha = 0.64
             engine_params.beta = 0.94
             engine_params.random_seed = 721
diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
index be82cc292..d11584b01 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
@@ -580,7 +580,7 @@ def test_fft_filter_UNITY(self):
 
         output = au.fft_filter(data, kernel, prefactor, postfactor)
 
-        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6)
+        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-5)
 
     def test_fft_filter_batched_UNITY(self):
         sh = (2,16, 35)
@@ -607,7 +607,7 @@ def test_fft_filter_batched_UNITY(self):
         output = au.fft_filter(data, kernel, prefactor, postfactor)
         print(data_dev.get())
 
-        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6)
+        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_little_blurring_UNITY(self):
         # Arrange
@@ -624,7 +624,7 @@ def test_complex_gaussian_filter_fft_little_blurring_UNITY(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
         
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_more_blurring_UNITY(self):
         # Arrange
@@ -641,7 +641,7 @@ def test_complex_gaussian_filter_fft_more_blurring_UNITY(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
 
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_nonsquare_UNITY(self):
         # Arrange
@@ -660,7 +660,7 @@ def test_complex_gaussian_filter_fft_nonsquare_UNITY(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
 
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_batched(self):
         # Arrange
@@ -680,4 +680,4 @@ def test_complex_gaussian_filter_fft_batched(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
 
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
\ No newline at end of file
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py b/test/accelerate_tests/cuda_pycuda_tests/engine_test.py
similarity index 79%
rename from test/accelerate_tests/cuda_pycuda_tests/engine_tests.py
rename to test/accelerate_tests/cuda_pycuda_tests/engine_test.py
index 20b950c61..b1148b8a6 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/engine_test.py
@@ -23,7 +23,7 @@ def setUp(self):
     def tearDown(self):
         shutil.rmtree(self.outpath)
 
-    def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
+    def check_engine_output(self, output, plotting=False, debug=False, scan="MF", tol=0.1):
         key = "S%sG00" %scan
         P_ML_serial, P_ML_pycuda = output
         numiter = len(P_ML_serial.runtime["iter_info"])
@@ -36,6 +36,11 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         MED_ML_pycuda = np.median(np.angle(OBJ_ML_pycuda))
         eng_ML_serial = P_ML_serial.engines["engine00"]
         eng_ML_pycuda = P_ML_pycuda.engines["engine00"]
+        # Normalize the outputs
+        PRB_ML_serial_max = np.abs(PRB_ML_serial).max()
+        PRB_ML_pycuda_max = np.abs(PRB_ML_pycuda).max()
+        OBJ_ML_pycuda *= (PRB_ML_pycuda_max / PRB_ML_serial_max)
+        PRB_ML_pycuda /= (PRB_ML_pycuda_max / PRB_ML_serial_max)
         if debug:
             import matplotlib.pyplot as plt
             plt.figure("ML serial debug")
@@ -71,13 +76,13 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         RMSE_ob = (np.mean(np.abs(OBJ_ML_pycuda - OBJ_ML_serial)**2))
         RMSE_pr = (np.mean(np.abs(PRB_ML_pycuda - PRB_ML_serial)**2))
         # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2, 
-                                    err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol,
                                     err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol,
+                                    err_msg="The probe arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
-    
+
     def test_ML_pycuda_base(self):
         out = []
         for eng in ["ML_serial", "ML_pycuda"]:
@@ -89,7 +94,7 @@ def test_ML_pycuda_base(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_regularizer(self):
@@ -103,7 +108,7 @@ def test_ML_pycuda_regularizer(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_preconditioner(self):
@@ -118,7 +123,7 @@ def test_ML_pycuda_preconditioner(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_floating(self):
@@ -132,7 +137,7 @@ def test_ML_pycuda_floating(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_smoothing_regularizer(self):
@@ -148,8 +153,27 @@ def test_ML_pycuda_smoothing_regularizer(self):
             engine_params.smooth_gradient_decay = 1/10.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
-        self.check_engine_output(out, plotting=False, debug=False)
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
+
+    def test_ML_pycuda_wavefield_preconditioner(self):
+        out = []
+        for eng in ["ML_serial", "ML_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 700
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_pycuda_all(self):
         out = []
@@ -165,7 +189,7 @@ def test_ML_pycuda_all(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="info"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
 if __name__ == "__main__":
diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
index be96aed54..6aa3ac537 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
@@ -13,7 +13,7 @@
     from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu
     from ptypy.utils import parallel
 
-from pkg_resources import parse_version
+from packaging.version import parse
 
 class GpuDataTest(unittest.TestCase):
     """
@@ -51,8 +51,8 @@ def tearDown(self):
 
     @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0")
     def test_version(self):
-        v1 = parse_version("3.1.0")
-        v2 = parse_version(parse_version("3.1.0a").base_version)
+        v1 = parse("3.1.0")
+        v2 = parse(parse("3.1.0a").base_version)
 
         self.assertGreaterEqual(v2, v1)
 
diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
index 5fc1112cd..5a0167d01 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
@@ -648,6 +648,127 @@ def test_ob_update_ML_atomics_REGRESSION(self):
     def test_ob_update_ML_tiled_REGRESSION(self):
         self.ob_update_ML_tester(False)
 
+    def pr_update_ML_wavefield_tester(self, atomics=False):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        probe_wavefield = gpuarray.to_gpu(np.zeros_like(probe, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = gpuarray.to_gpu(addr2)
+
+        POUK.pr_update_ML_wavefield(addr, probe, probe_wavefield, object_array, exit_wave, atomics=atomics)
+
+        expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]],
+
+                                   [[786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]],
+                                  dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(probe.get(), expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+        expected_probe_wavefield = np.array(
+                                  [[[136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.]],
+
+                                   [[136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.]]], 
+                                    dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(probe_wavefield.get(), expected_probe_wavefield,
+                                      err_msg="The probe wavefield has not been updated as expected")
+
+    def test_pr_update_ML_wavefield_atomics_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(True)
+
+    def test_pr_update_ML_wavefield_tiled_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(False)
+
+    def ob_update_ML_wavefield_tester(self, atomics=True):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        object_array_wavefield = gpuarray.to_gpu(np.zeros_like(object_array, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = gpuarray.to_gpu(addr2)
+
+        POUK.ob_update_ML_wavefield(addr, object_array, object_array_wavefield, probe, exit_wave, atomics=atomics)
+
+        expected_object_array = np.array(
+            [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j],
+              [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]],
+
+             [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j],
+              [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]],
+            dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(object_array.get(), expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+        expected_object_array_wavefield = np.array(
+          [[[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+
+           [[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]], 
+            dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(object_array_wavefield.get(), expected_object_array_wavefield,
+                                      err_msg="The object array wavefield has not been updated as expected")
+
+    def test_ob_update_ML_wavefield_atomics_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(True)
+
+    def test_ob_update_ML_wavefield_tiled_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(False)
+
     def test_ob_update_local_UNITY(self):
         '''
         setup
diff --git a/test/utils.py b/test/utils.py
index 5e184faa6..1a4e9dd3b 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -45,9 +45,11 @@ def PtyscanTestRunner(ptyscan_instance, data_params, save_type='append', auto_fr
 
 
 def EngineTestRunner(engine_params,propagator='farfield',output_path='./', output_file=None,
-                    autosave=True, scanmodel="Full", verbose_level="info", init_correct_probe=False):
+                    autosave=True, scanmodel="Full", verbose_level="info", init_correct_probe=False,
+                    frames_per_block=100000):
 
     p = u.Param()
+    p.frames_per_block = frames_per_block
     p.verbose_level = verbose_level
     p.io = u.Param()
     p.io.home = output_path