From 0c78215c8190283c51b25706fcaf284d8b03917c Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Mon, 9 Dec 2024 11:38:21 +0000
Subject: [PATCH 01/12] Drop Python <= 3.8 support and add 3.13 (#588)

* Drop Python <= 3.8 support

* Add py 3.13 in CI tests

* Use conda-incubator to set up conda env

* Ensure correct activation throughout the tests

* Migrate to importlib in script2rst

* Migrate to importlib in resources

* Migrate to packaging.version
---
 .github/workflows/test.yml                    | 32 ++++++++-----------
 dependencies_dev.yml                          |  1 +
 dependencies_full.yml                         |  1 +
 doc/script2rst.py                             | 30 ++++++++---------
 ptypy/accelerate/cuda_cupy/multi_gpu.py       |  8 ++---
 ptypy/accelerate/cuda_pycuda/multi_gpu.py     | 10 +++---
 ptypy/resources/__init__.py                   | 14 ++++----
 pyproject.toml                                |  6 ++--
 .../cuda_cupy_tests/multi_gpu_test.py         | 10 +++---
 .../cuda_pycuda_tests/multi_gpu_test.py       |  6 ++--
 10 files changed, 58 insertions(+), 60 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d53a9a632..b37bc8c30 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,16 +20,19 @@ on:
 jobs:
   build-linux:
     runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -el {0}
     strategy:
       max-parallel: 10
       fail-fast: false
       matrix:
         python-version:
-          - "3.8"
           - "3.9"
           - "3.10"
           - "3.11"
           - "3.12"
+          - "3.13"
         conda-env:
           - "core"
           - "full"
@@ -37,20 +40,14 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v5
+    - uses: conda-incubator/setup-miniconda@v3
       with:
-        python-version:  ${{ matrix.python-version }}
-    - name: Add conda to system path
-      run: |
-        # $CONDA is an environment variable pointing to the root of the miniconda directory
-        echo $CONDA/bin >> $GITHUB_PATH
-        conda --version
-        conda info
-    - name: Make sure conda is updated
-      run: |
-        conda update conda
-        conda --version
+        auto-update-conda: true
+        auto-activate-base: false
+        conda-remove-defaults: true
+        channels: conda-forge
+        activate-environment: ptypy_env
+        python-version: ${{ matrix.python-version }}
     - name: Install ${{ matrix.conda-env }} dependencies
       run: |
         # replace python version in dependencies
@@ -58,16 +55,13 @@ jobs:
         if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.12' ]; then
           sed -i '/- pyfftw/d' dependencies_${{ matrix.conda-env }}.yml
         fi
-        # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.8' ]; then
-        #   sed -i '/- mpi4py/d' dependencies_${{ matrix.conda-env }}.yml
-        # fi
         # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.9' ]; then
         #   sed -i '/- mpi4py/d' dependencies_${{ matrix.conda-env }}.yml
         # fi
         conda install --solver=classic mpich
-        conda env update --file dependencies_${{ matrix.conda-env }}.yml --name base
+        conda env update --file dependencies_${{ matrix.conda-env }}.yml --name ptypy_env
         conda install --solver=classic flake8 pytest pytest-cov
-        conda list	
+        conda list
     - name: Prepare ptypy
       run: |
         # Install ptypy
diff --git a/dependencies_dev.yml b/dependencies_dev.yml
index 5462c145a..6563f9dd8 100644
--- a/dependencies_dev.yml
+++ b/dependencies_dev.yml
@@ -10,6 +10,7 @@ dependencies:
   - pyzmq
   - pep8
   - mpi4py
+  - packaging
   - pillow
   - pyfftw
   - pip
diff --git a/dependencies_full.yml b/dependencies_full.yml
index e43241fce..205bdb19e 100644
--- a/dependencies_full.yml
+++ b/dependencies_full.yml
@@ -9,6 +9,7 @@ dependencies:
   - h5py
   - pyzmq
   - mpi4py
+  - packaging
   - pillow
   - pyfftw
   - pyyaml
diff --git a/doc/script2rst.py b/doc/script2rst.py
index b05148e46..b30d94667 100644
--- a/doc/script2rst.py
+++ b/doc/script2rst.py
@@ -1,5 +1,6 @@
 import sys
 import io
+from importlib.resources import files
 import contextlib
 import os
 
@@ -9,13 +10,12 @@
            'simupod.py',
            'ownengine.py',
            'subclassptyscan.py']
+_ptypy_dir = files('ptypy')
 
 if len(sys.argv) == 1:
-    import pkg_resources
-
     for script in scripts:
-        scr = pkg_resources.resource_filename('ptypy', tutorial_dir+script)
-        if not os.path.exists(scr):
+        scr = _ptypy_dir / (tutorial_dir + script)
+        if not scr.exists():
             print('Using backup tutorial for %s' % script)
             scr = '../tutorial/'+script
         #subprocess.call(['python',sys.argv[0]+' '+scr]) # doesn't work
@@ -50,13 +50,13 @@ def stdoutIO(stdout=None):
 frst.write("""
 .. note::
    This tutorial was generated from the python source
-   :file:`[ptypy_root]/tutorial/%(fname)s` using :file:`ptypy/doc/%(this)s`. 
+   :file:`[ptypy_root]/tutorial/%(fname)s` using :file:`ptypy/doc/%(this)s`.
    You are encouraged to modify the parameters and rerun the tutorial with::
-   
+
      $ python [ptypy_root]/tutorial/%(fname)s
 
 """ % {'fname': os.path.split(script_name)[-1], 'this': sys.argv[0]})
- 
+
 was_comment = True
 
 while True:
@@ -86,7 +86,7 @@ def stdoutIO(stdout=None):
             frst.write('   '+line2[1:].strip()+'\n')
         frst.write('\n')
         continue
-        
+
     if line.startswith('"""'):
         frst.write('.. parsed-literal::\n\n')
         while True:
@@ -95,11 +95,11 @@ def stdoutIO(stdout=None):
                 break
             frst.write('   ' + line2)
         continue
-    
+
     decorator = False
     indent = False
     for key in indent_keys:
-        if line.startswith(key): 
+        if line.startswith(key):
             indent = True
             break
 
@@ -125,12 +125,12 @@ def stdoutIO(stdout=None):
             pt = fpy.tell()
         exec(func+'\n')
         continue
-        
+
     wline = line.strip()
     if not wline:
         frst.write('\n')
         continue
-    
+
     with stdoutIO() as sout:
         exec(wline)
         out = sout.getvalue()
@@ -150,9 +150,9 @@ def stdoutIO(stdout=None):
             if was_comment:
                 wline = '\n::\n\n'+wline
                 was_comment = False
-                
+
             frst.write(wline+'\n')
-        
+
         #print out
         if out.strip():
             print(out)
@@ -160,5 +160,5 @@ def stdoutIO(stdout=None):
                 frst.write(' '*3+l+'\n')
             out = ''
 
-    
+
 
diff --git a/ptypy/accelerate/cuda_cupy/multi_gpu.py b/ptypy/accelerate/cuda_cupy/multi_gpu.py
index 6ccbce9a7..e631c3405 100644
--- a/ptypy/accelerate/cuda_cupy/multi_gpu.py
+++ b/ptypy/accelerate/cuda_cupy/multi_gpu.py
@@ -10,12 +10,12 @@
   - OpenMPI in a conda install needs to have the environment variable
   --> if cuda support isn't enabled, the application simply crashes with a seg fault
 
-2) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. 
+2) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
    It should be in DEFAULT mode.
 
 """
 
-from pkg_resources import parse_version
+from packaging.version import parse
 import numpy as np
 import cupy as cp
 from ptypy.utils import parallel
@@ -44,7 +44,7 @@
 have_cuda_mpi = (mpi4py is not None) and \
     "OMPI_MCA_opal_cuda_support" in os.environ and \
     os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \
-    parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \
+    parse(parse(mpi4py.__version__).base_version) >= parse("3.1.0") and \
     not ('PTYPY_USE_MPI' in os.environ)
 
 
@@ -114,7 +114,7 @@ def allReduceSum(self, arr):
 
         count, datatype = self.__get_NCCL_count_dtype(arr)
 
-        self.com.allReduce(arr.data.ptr, arr.data.ptr, count, datatype, nccl.NCCL_SUM, 
+        self.com.allReduce(arr.data.ptr, arr.data.ptr, count, datatype, nccl.NCCL_SUM,
             cp.cuda.get_current_stream().ptr)
 
     def __get_NCCL_count_dtype(self, arr):
diff --git a/ptypy/accelerate/cuda_pycuda/multi_gpu.py b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
index 73138c3ee..09ec60552 100644
--- a/ptypy/accelerate/cuda_pycuda/multi_gpu.py
+++ b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
@@ -5,7 +5,7 @@
 
 Findings:
 
-1) NCCL works with unit tests, but not in the engines. It seems to 
+1) NCCL works with unit tests, but not in the engines. It seems to
 add something to the existing pycuda Context or create a new one,
 as a later event recording on an exit wave transfer fails with
 'ivalid resource handle' Cuda Error. This error typically happens if for example
@@ -22,14 +22,14 @@
   - OpenMPI in a conda install needs to have the environment variable
   --> if cuda support isn't enabled, the application simply crashes with a seg fault
 
-4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. 
+4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used.
    It should be in DEFAULT mode.
 
 5) NCCL support has been dropped from PyCUDA module, but can be used with CuPy module instead
 
 """
 
-from pkg_resources import parse_version
+from packaging.version import parse
 import numpy as np
 from pycuda import gpuarray
 import pycuda.driver as cuda
@@ -54,7 +54,7 @@
 have_cuda_mpi = (mpi4py is not None) and \
     "OMPI_MCA_opal_cuda_support" in os.environ and \
     os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \
-    parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \
+    parse(parse(mpi4py.__version__).base_version) >= parse("3.1.0") and \
     hasattr(gpuarray.GPUArray, '__cuda_array_interface__') and \
     not ('PTYPY_USE_MPI' in os.environ)
 
@@ -97,7 +97,7 @@ def allReduceSum(self, arr):
         if parallel.MPIenabled:
             comm = parallel.comm
             comm.Allreduce(parallel.MPI.IN_PLACE, arr)
-            
+
 
 # pick the appropriate communicator depending on installed packages
 def get_multi_gpu_communicator(use_cuda_mpi=True):
diff --git a/ptypy/resources/__init__.py b/ptypy/resources/__init__.py
index 50bab0b5e..becdece26 100644
--- a/ptypy/resources/__init__.py
+++ b/ptypy/resources/__init__.py
@@ -1,15 +1,17 @@
-import pkg_resources
+from importlib.resources import files
 import numpy as np
 
-flowerfile = pkg_resources.resource_filename(__name__,'flowers.png')
-moonfile = pkg_resources.resource_filename(__name__,'moon.png')
-treefile = pkg_resources.resource_filename(__name__,'tree.png')
+_res_dir = files('ptypy.resources')
+flowerfile =  _res_dir / 'flowers.png'
+moonfile = _res_dir / 'moon.png'
+treefile = _res_dir / 'tree.png'
+
 
 def flower_obj(shape=None):
     from ptypy import utils as u
     import numpy as np
     from matplotlib.image import imread
-    
+
     im = u.rgb2complex(imread(flowerfile))
     if shape is not None:
         sh = u.expect2(shape)
@@ -37,7 +39,7 @@ def moon_pr(shape=None):
     from ptypy import utils as u
     import numpy as np
     from matplotlib.image import imread
-    
+
     im = u.rgb2complex(imread(moonfile))
     if shape is not None:
         sh = u.expect2(shape)
diff --git a/pyproject.toml b/pyproject.toml
index 4b35cf554..d3493bce4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ authors = [
 ]
 description = "Ptychography Reconstruction for Python"
 readme = "README.rst"
-requires-python = ">=3.7"
+requires-python = ">=3.9"
 classifiers = [
     "Programming Language :: Python :: 3",
     "Development Status :: 3 - Alpha",
@@ -31,7 +31,7 @@ dependencies = [
 [project.optional-dependencies]
 full = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "pyyaml"]
 cupy-cuda11x = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "cupy-cuda11x"]
-cupy-cuda12x = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "cupy-cuda12x"] 
+cupy-cuda12x = ["mpi4py","matplotlib","pyzmq","pillow", "pyfftw", "cupy-cuda12x"]
 
 [project.scripts]
 "ptypy.plot" = "ptypy.cli.plotter:ptypy_plot"
@@ -69,4 +69,4 @@ ptypy = "ptypy"
 
 [tool.setuptools.package-data]
 ptypy = ["resources/*",]
-"ptypy.accelerate.cuda_common" = ["*.cu", "*.cuh"]
\ No newline at end of file
+"ptypy.accelerate.cuda_common" = ["*.cu", "*.cuh"]
diff --git a/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
index 0c234d878..424c6696b 100644
--- a/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
@@ -11,7 +11,7 @@
     from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu
     from ptypy.utils import parallel
 
-from pkg_resources import parse_version
+from packaging.version import parse
 
 class GpuDataTest(CupyCudaTest):
     """
@@ -36,8 +36,8 @@ def setUp(self):
 
     @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0")
     def test_version(self):
-        v1 = parse_version("3.1.0")
-        v2 = parse_version(parse_version("3.1.0a").base_version)
+        v1 = parse("3.1.0")
+        v2 = parse(parse("3.1.0a").base_version)
 
         self.assertGreaterEqual(v2, v1)
 
@@ -61,7 +61,7 @@ def multigpu_tester(self, com):
 
     def test_multigpu_auto(self):
         self.multigpu_tester(mgpu.get_multi_gpu_communicator())
-        
+
     def test_multigpu_mpi(self):
         self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi())
 
@@ -71,4 +71,4 @@ def test_multigpu_cudampi(self):
 
     @unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
     def test_multigpu_nccl(self):
-        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
\ No newline at end of file
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
index be96aed54..6aa3ac537 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
@@ -13,7 +13,7 @@
     from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu
     from ptypy.utils import parallel
 
-from pkg_resources import parse_version
+from packaging.version import parse
 
 class GpuDataTest(unittest.TestCase):
     """
@@ -51,8 +51,8 @@ def tearDown(self):
 
     @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0")
     def test_version(self):
-        v1 = parse_version("3.1.0")
-        v2 = parse_version(parse_version("3.1.0a").base_version)
+        v1 = parse("3.1.0")
+        v2 = parse(parse("3.1.0a").base_version)
 
         self.assertGreaterEqual(v2, v1)
 

From ac2563d6026c5944dd31308d50deadccfef649be Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 13 Dec 2024 18:08:14 +0000
Subject: [PATCH 02/12] Fix in compilation of filtered_fft, import from
 setuptools instead of distutils (#591)

---
 cufft/extensions.py | 5 +++--
 cufft/setup.py      | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/cufft/extensions.py b/cufft/extensions.py
index 4d8b7f598..468d8d1cb 100644
--- a/cufft/extensions.py
+++ b/cufft/extensions.py
@@ -4,8 +4,8 @@
 import os, re
 import subprocess
 import sysconfig
-from distutils.unixccompiler import UnixCCompiler
-from distutils.command.build_ext import build_ext
+from setuptools._distutils.unixccompiler import UnixCCompiler
+from setuptools.command.build_ext import build_ext
 
 
 def find_in_path(name, path):
@@ -116,6 +116,7 @@ def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
         compiler_command = [self.CUDA["nvcc"]] + self.NVCC_FLAGS + self.OPTFLAGS + ["-Xcompiler"] + self.CXXFLAGS + CPPFLAGS
         compiler_exec = " ".join(compiler_command)
         self.set_executable('compiler_so', compiler_exec)
+        self.set_executable('compiler_so_cxx', compiler_exec)
         postargs = [] # we don't actually have any postargs
         super(NvccCompiler, self)._compile(obj, src, ext, cc_args, postargs, pp_opts) # the _compile method
         # reset the default compiler_so, which we might have changed for cuda
diff --git a/cufft/setup.py b/cufft/setup.py
index a8ef7c61e..b02262bdf 100644
--- a/cufft/setup.py
+++ b/cufft/setup.py
@@ -2,7 +2,7 @@
 
 # we should aim to remove the distutils dependency
 import setuptools
-from distutils.core import setup, Extension
+from setuptools import setup, Extension
 import os
 
 ext_modules = []

From 021709f5c1fc0d428bb56bdc634cb6b29a908bb2 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 13 Dec 2024 20:53:04 +0000
Subject: [PATCH 03/12] FFT filter changes with Numpy v2.0 (#592)

* Include accelerate base tests by default

* Change expected output in FFT filter tests to reflect support for single precision in Numpy > 2.0

* Adjust tolerance in pycuda fft filter tests
---
 pyproject.toml                                |   1 +
 .../base_tests/array_utils_test.py            | 142 +++++++++---------
 .../cuda_pycuda_tests/array_utils_test.py     |  12 +-
 3 files changed, 78 insertions(+), 77 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d3493bce4..3f4b0c9b1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,6 +55,7 @@ testpaths = [
   "test/ptyscan_tests",
   "test/template_tests",
   "test/util_tests",
+  "test/accelerate_tests/base_tests"
 ]
 
 # this is all BETA according to setuptools
diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py
index 364352a5c..78540f978 100644
--- a/test/accelerate_tests/base_tests/array_utils_test.py
+++ b/test/accelerate_tests/base_tests/array_utils_test.py
@@ -305,42 +305,42 @@ def test_fft_filter(self):
         kernel = np.fft.fftn(rk)
 
         output = au.fft_filter(data, kernel, prefactor, postfactor)
+
+        known_test_output = np.array([-0.0000000e+00+0.00000000e+00j, -0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00-0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      -0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      -0.0000000e+00+0.00000000e+00j, -0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00-0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00+0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  0.0000000e+00-0.00000000e+00j,
+                                      0.0000000e+00+0.00000000e+00j,  6.1097220e-05+2.92563982e-05j,
+                                      4.0044695e-05+2.52102855e-05j,  8.9999994e+02+9.00000000e+02j,
+                                      8.9999988e+02+8.99999939e+02j,  5.0999994e+02+5.09999939e+02j,
+                                      1.9365043e-05+5.84280206e-05j,  3.0681291e-05+2.31116355e-05j,
+                                      1.2552022e-05-1.01537153e-05j, -1.4034913e-05-1.17988075e-05j,
+                                      -1.9193330e-05+2.72889110e-07j,  1.3895768e-05+1.64778357e-05j,
+                                      6.5228807e-05+2.45708943e-05j,  3.8999994e+02+3.89999939e+02j,
+                                      8.9999988e+02+8.99999878e+02j,  8.9999982e+02+8.99999878e+02j,
+                                      3.0000015e+01+3.00000248e+01j,  3.8863189e-05+3.26705631e-05j,
+                                      2.8768281e-06-1.62116921e-05j, -3.2418033e-05-1.97073969e-05j,
+                                      -6.6843757e-05+7.19546824e-06j,  6.5036993e-06+3.95851657e-06j,
+                                      -2.4053887e-05+9.88548163e-06j,  1.5231475e-05+1.31202614e-06j,
+                                      8.7000000e+01+8.70000305e+01j,  6.1035156e-05+0.00000000e+00j,
+                                      6.1035156e-05+0.00000000e+00j, -2.4943074e-07+6.62429193e-06j,
+                                      1.6712515e-06-2.97475322e-06j,  1.9025241e-05+2.97752194e-07j,
+                                      -9.2436176e-07-3.86252796e-05j, -8.8145862e-06-9.89961700e-06j,
+                                      -1.5782407e-06+1.01533060e-05j, -4.7593076e-06+2.96332291e-05j])
         
-        known_test_output =  np.array([-0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                       -0.00000000e+00 + 0.00000000e+00j, 0.00000000e+00 - 0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  8.66422277e-14+4.86768828e-14j,
-                                      7.23113320e-14+2.82331542e-14j,  9.00000000e+02+9.00000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  5.10000000e+02+5.10000000e+02j,
-                                      1.41172830e-14+3.62223425e-14j,  2.61684238e-14-4.13866575e-14j,
-                                      2.16691314e-14-1.95102733e-14j, -1.36536942e-13-9.94589021e-14j,
-                                     -1.42905371e-13-5.77964697e-14j, -5.00005072e-14+4.08620637e-14j,
-                                      6.38160272e-14+7.61753583e-14j,  3.90000000e+02+3.90000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  9.00000000e+02+9.00000000e+02j,
-                                      3.00000000e+01+3.00000000e+01j,  8.63255773e-14+7.08532924e-14j,
-                                      1.80941313e-14-3.85517154e-14j,  7.84277340e-14-1.32008745e-14j,
-                                     -6.57025196e-14-1.72739350e-14j, -6.69570857e-15+6.49622898e-14j,
-                                      6.27436466e-15+7.57162569e-14j,  2.01150157e-15+3.65538558e-14j,
-                                      8.70000000e+01+8.70000000e+01j, -1.13686838e-13-1.70530257e-13j,
-                                      0.00000000e+00-2.27373675e-13j, -1.84492121e-14-9.21502853e-14j,
-                                      2.12418687e-14-8.62209232e-14j,  1.20880692e-13+3.86522371e-14j,
-                                      1.03754734e-13+9.19851759e-14j,  5.50926123e-14+1.17150422e-13j,
-                                     -5.47869215e-14+5.87176511e-14j, -3.52652980e-14+8.44455504e-15j])
-
-        np.testing.assert_array_almost_equal(output.flat[::2000], known_test_output)
+        np.testing.assert_array_almost_equal(output.flat[::2000], known_test_output, decimal=5)
 
     def test_fft_filter_batched(self):
         data = np.zeros((2,256, 512), dtype=COMPLEX_TYPE)
@@ -356,42 +356,42 @@ def test_fft_filter_batched(self):
         kernel = np.fft.fftn(rk)
 
         output = au.fft_filter(data, kernel, prefactor, postfactor)
-        
-        known_test_output =  np.array([-0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                       -0.00000000e+00 + 0.00000000e+00j, 0.00000000e+00 - 0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j, -0.00000000e+00+0.00000000e+00j,
-                                     -0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00+0.00000000e+00j,
-                                      0.00000000e+00+0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  0.00000000e+00-0.00000000e+00j,
-                                      0.00000000e+00-0.00000000e+00j,  8.66422277e-14+4.86768828e-14j,
-                                      7.23113320e-14+2.82331542e-14j,  9.00000000e+02+9.00000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  5.10000000e+02+5.10000000e+02j,
-                                      1.41172830e-14+3.62223425e-14j,  2.61684238e-14-4.13866575e-14j,
-                                      2.16691314e-14-1.95102733e-14j, -1.36536942e-13-9.94589021e-14j,
-                                     -1.42905371e-13-5.77964697e-14j, -5.00005072e-14+4.08620637e-14j,
-                                      6.38160272e-14+7.61753583e-14j,  3.90000000e+02+3.90000000e+02j,
-                                      9.00000000e+02+9.00000000e+02j,  9.00000000e+02+9.00000000e+02j,
-                                      3.00000000e+01+3.00000000e+01j,  8.63255773e-14+7.08532924e-14j,
-                                      1.80941313e-14-3.85517154e-14j,  7.84277340e-14-1.32008745e-14j,
-                                     -6.57025196e-14-1.72739350e-14j, -6.69570857e-15+6.49622898e-14j,
-                                      6.27436466e-15+7.57162569e-14j,  2.01150157e-15+3.65538558e-14j,
-                                      8.70000000e+01+8.70000000e+01j, -1.13686838e-13-1.70530257e-13j,
-                                      0.00000000e+00-2.27373675e-13j, -1.84492121e-14-9.21502853e-14j,
-                                      2.12418687e-14-8.62209232e-14j,  1.20880692e-13+3.86522371e-14j,
-                                      1.03754734e-13+9.19851759e-14j,  5.50926123e-14+1.17150422e-13j,
-                                     -5.47869215e-14+5.87176511e-14j, -3.52652980e-14+8.44455504e-15j])
-
-        np.testing.assert_array_almost_equal(output[1].flat[::2000], known_test_output)
+
+        known_test_output = np.array([ 0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j, 0.00000000e+00+0.0000000e+00j,
+                                       -0.00000000e+00+0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j, 0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j,  0.00000000e+00+0.0000000e+00j,
+                                       0.00000000e+00+0.0000000e+00j, 0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  0.00000000e+00-0.0000000e+00j,
+                                       0.00000000e+00-0.0000000e+00j,  4.86995195e-05-9.1511911e-06j,
+                                       5.89395277e-05+3.6706428e-05j,  8.99999817e+02+9.0000000e+02j,
+                                       8.99999817e+02+9.0000000e+02j,  5.09999969e+02+5.0999997e+02j,
+                                       6.86399580e-05+5.5245564e-05j, -2.15578075e-06-8.0761157e-07j,
+                                       -5.99612467e-05-3.7489859e-05j, -2.08058154e-05-1.7001423e-05j,
+                                       -3.15661709e-05-2.0192698e-05j, -1.17410173e-05-2.3929812e-05j,
+                                       8.41844594e-05+4.9635066e-05j,  3.90000031e+02+3.9000003e+02j,
+                                       8.99999817e+02+8.9999994e+02j,  8.99999817e+02+8.9999994e+02j,
+                                       3.00000153e+01+3.0000000e+01j,  4.75842753e-05+1.7961407e-05j,
+                                       -1.28229876e-05-3.3492659e-05j, -1.50405585e-05+3.0159079e-05j,
+                                       -1.00799960e-04-6.6932058e-05j, -4.90295024e-05-3.6601130e-05j,
+                                       -4.48861247e-05-1.4717044e-05j,  2.60417364e-05-8.3221821e-06j,
+                                       8.69999847e+01+8.7000046e+01j,  4.31583721e-05+4.3158372e-05j,
+                                       4.31583721e-05+4.3158372e-05j,  4.04649109e-06-1.6836095e-05j,
+                                       1.37377283e-05+5.2577798e-06j, -2.30404657e-05-3.4596611e-05j,
+                                       -1.33214944e-05-3.2517899e-05j,  2.45428764e-05-3.5186855e-07j,
+                                       -1.85950885e-05-2.1921931e-05j, -1.65030433e-05-8.0249208e-07j])
+                        
+        np.testing.assert_array_almost_equal(output[1].flat[::2000], known_test_output, decimal=5)
 
 
     def test_complex_gaussian_filter_fft(self):
diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
index be82cc292..d11584b01 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py
@@ -580,7 +580,7 @@ def test_fft_filter_UNITY(self):
 
         output = au.fft_filter(data, kernel, prefactor, postfactor)
 
-        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6)
+        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-5)
 
     def test_fft_filter_batched_UNITY(self):
         sh = (2,16, 35)
@@ -607,7 +607,7 @@ def test_fft_filter_batched_UNITY(self):
         output = au.fft_filter(data, kernel, prefactor, postfactor)
         print(data_dev.get())
 
-        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6)
+        np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_little_blurring_UNITY(self):
         # Arrange
@@ -624,7 +624,7 @@ def test_complex_gaussian_filter_fft_little_blurring_UNITY(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
         
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_more_blurring_UNITY(self):
         # Arrange
@@ -641,7 +641,7 @@ def test_complex_gaussian_filter_fft_more_blurring_UNITY(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
 
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_nonsquare_UNITY(self):
         # Arrange
@@ -660,7 +660,7 @@ def test_complex_gaussian_filter_fft_nonsquare_UNITY(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
 
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
 
     def test_complex_gaussian_filter_fft_batched(self):
         # Arrange
@@ -680,4 +680,4 @@ def test_complex_gaussian_filter_fft_batched(self):
         out_exp = au.complex_gaussian_filter_fft(data, mfs)
         out = data_dev.get()
 
-        np.testing.assert_allclose(out_exp, out, atol=1e-6)
\ No newline at end of file
+        np.testing.assert_allclose(out_exp, out, atol=1e-5)
\ No newline at end of file

From 89de9cc125f3ff3a095dc0d086f41b1bdef557d0 Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Thu, 6 Mar 2025 16:50:43 +0000
Subject: [PATCH 04/12] Control the start of probe update in WASP (#601)

* Use probe_update_start to control the probe update in WASP

* Add WASP tests with probe_update_start

* Update WASP publication

* Remove get_cmap as suggested in https://github.com/matplotlib/matplotlib/issues/20853

* Handle colormap registration in different versions of matplotlib

* Skip FFT filter tests if Numpy < 2.0

---------

Co-authored-by: Benedikt Daurer <benedikt.daurer@diamond.ac.uk>
---
 ptypy/custom/WASP.py                          | 28 ++++++-----
 ptypy/custom/WASP_cupy.py                     |  8 ++--
 ptypy/custom/WASP_pycuda.py                   |  9 ++--
 ptypy/custom/WASP_serial.py                   |  8 ++--
 ptypy/utils/plot_utils.py                     | 48 ++++++++++---------
 .../accelerate_tests/base_tests/WASP_tests.py | 17 +++++++
 .../base_tests/array_utils_test.py            |  5 +-
 .../cuda_cupy_tests/WASP_tests.py             | 17 +++++++
 .../cuda_pycuda_tests/WASP_tests.py           | 17 +++++++
 9 files changed, 111 insertions(+), 46 deletions(-)

diff --git a/ptypy/custom/WASP.py b/ptypy/custom/WASP.py
index 275224b9a..3ef84bb5c 100644
--- a/ptypy/custom/WASP.py
+++ b/ptypy/custom/WASP.py
@@ -124,11 +124,11 @@ def __init__(self, ptycho_parent, pars=None):
         self.article = dict(
                 title='WASP: Weighted Average of Sequential Projections for ptychographic phase retrieval',
                 author='A. M. Maiden, W. Mei and P. Li',
-                journal='Optica',
-                volume=42,
+                journal='Optics Express',
+                volume=32,
                 year=2024,
-                page=42,
-                doi='doi',
+                page=21327,
+                doi='10.1364/OE.516946',
                 comment='Weighted Average of Sequential Projections',
                 )
         self.ptycho.citations.add_article(**self.article)
@@ -237,7 +237,8 @@ def overlap_update(self):
             # update object first, then probe, and accumulate their sum for
             # averaging after going through all the views
             self.object_update(view, ex_old)
-            self.probe_update(view, ex_old, ob_old)
+            if self.p.probe_update_start <= self.curiter:
+                self.probe_update(view, ex_old, ob_old)
 
         # WASP
         self.wasp_averaging()
@@ -323,16 +324,17 @@ def wasp_averaging(self):
 
             self.clip_object(s)
 
-        for name, p in self.pr.storages.items():
-            pr_sum_nmr = self.pr_sum_nmr.storages[name].data
-            pr_sum_dnm = self.pr_sum_dnm.storages[name].data
+        if self.p.probe_update_start <= self.curiter:
+            for name, p in self.pr.storages.items():
+                pr_sum_nmr = self.pr_sum_nmr.storages[name].data
+                pr_sum_dnm = self.pr_sum_dnm.storages[name].data
 
-            parallel.allreduce(pr_sum_nmr)
-            parallel.allreduce(pr_sum_dnm)
+                parallel.allreduce(pr_sum_nmr)
+                parallel.allreduce(pr_sum_dnm)
 
-            # avoid division by zero
-            is_zero = np.isclose(pr_sum_dnm, 0)
-            p.data = np.where(is_zero, pr_sum_nmr, pr_sum_nmr / pr_sum_dnm)
+                # avoid division by zero
+                is_zero = np.isclose(pr_sum_dnm, 0)
+                p.data = np.where(is_zero, pr_sum_nmr, pr_sum_nmr / pr_sum_dnm)
 
     def clip_object(self, ob):
         """Copied from _ProjectionEngine
diff --git a/ptypy/custom/WASP_cupy.py b/ptypy/custom/WASP_cupy.py
index 06d1507fb..c23448c49 100644
--- a/ptypy/custom/WASP_cupy.py
+++ b/ptypy/custom/WASP_cupy.py
@@ -338,8 +338,9 @@ def engine_iterate(self, num=1):
                     ob_old = ob.copy()
                     POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
                                        ob_sum_dnm, alpha=self.p.alpha)
-                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
-                                       pr_sum_dnm, beta=self.p.beta)
+                    if self.p.probe_update_start <= self.curiter:
+                        POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                           pr_sum_dnm, beta=self.p.beta)
 
                     ## compute log-likelihood
                     if self.p.compute_log_likelihood:
@@ -355,7 +356,8 @@ def engine_iterate(self, num=1):
                 self.multigpu.allReduceSum(pr_sum_dnm)
 
                 POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
-                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+                if self.p.probe_update_start <= self.curiter:
+                    POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
 
                 # Clip object
                 if self.p.clip_object is not None:
diff --git a/ptypy/custom/WASP_pycuda.py b/ptypy/custom/WASP_pycuda.py
index e96bce1b5..e631e855a 100644
--- a/ptypy/custom/WASP_pycuda.py
+++ b/ptypy/custom/WASP_pycuda.py
@@ -338,9 +338,9 @@ def engine_iterate(self, num=1):
                     ob_old = ob.copy()
                     POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
                                        ob_sum_dnm, alpha=self.p.alpha)
-                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
-                                       pr_sum_dnm, beta=self.p.beta)
-
+                    if self.p.probe_update_start <= self.curiter:
+                        POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                           pr_sum_dnm, beta=self.p.beta)
 
                     ## compute log-likelihood
                     if self.p.compute_log_likelihood:
@@ -356,7 +356,8 @@ def engine_iterate(self, num=1):
                 self.multigpu.allReduceSum(pr_sum_dnm)
 
                 POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
-                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+                if self.p.probe_update_start <= self.curiter:
+                    POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
 
                 # Clip object
                 if self.p.clip_object is not None:
diff --git a/ptypy/custom/WASP_serial.py b/ptypy/custom/WASP_serial.py
index 96391ea27..ccd961b2a 100644
--- a/ptypy/custom/WASP_serial.py
+++ b/ptypy/custom/WASP_serial.py
@@ -292,8 +292,9 @@ def engine_iterate(self, num=1):
                     ob_old = ob.copy()
                     POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
                                        ob_sum_dnm, alpha=self.p.alpha)
-                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
-                                       pr_sum_dnm, beta=self.p.beta)
+                    if self.p.probe_update_start <= self.curiter:
+                        POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                           pr_sum_dnm, beta=self.p.beta)
 
                     self.benchmark.wasp_ob_pr_update += time.time() - t1
                     self.benchmark.calls_wasp_ob_pr_update += 1
@@ -321,7 +322,8 @@ def engine_iterate(self, num=1):
                 parallel.allreduce(pr_sum_dnm)
 
                 POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
-                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+                if self.p.probe_update_start <= self.curiter:
+                    POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
 
                 self.benchmark.wasp_averaging += time.time() - t1
                 self.benchmark.calls_wasp_averaging += 1
diff --git a/ptypy/utils/plot_utils.py b/ptypy/utils/plot_utils.py
index 2f85e72b5..a4a81a596 100644
--- a/ptypy/utils/plot_utils.py
+++ b/ptypy/utils/plot_utils.py
@@ -128,7 +128,7 @@ def pause(timeout=-1, message=None):
                     print(message)
                 time.sleep(timeout)
 
-# BD: With version 9.1.0 of PIL, _MODE_CONV has been removed, 
+# BD: With version 9.1.0 of PIL, _MODE_CONV has been removed,
 #     see here: https://github.com/python-pillow/Pillow/pull/6057
 #     can't see a reason why this is still needed, therefore commenting it out
 # FIXME: Is this still needed?
@@ -179,7 +179,7 @@ def complex2hsv(cin, vmin=0., vmax=None):
     else:
         assert vmin < vmax
         v = (v.clip(vmin, vmax)-vmin)/(vmax-vmin)
-    
+
     return np.asarray((h, s, v))
 
 
@@ -345,7 +345,7 @@ def imsave(a, filename=None, vmin=None, vmax=None, cmap=None):
     uses a matplotlib colormap with name 'gray'
     """
     if str(cmap) == cmap:
-        cmap = mpl.colormaps.get_cmap(cmap)
+        cmap = mpl.colormaps[cmap]
 
     if a.dtype.kind == 'c':
         i = complex2rgb(a, vmin=vmin, vmax=vmax)
@@ -415,8 +415,12 @@ def imload(filename):
                                                       (0.340,   1,    1),
                                                       (0.650,   0,    0),
                                                       (1.000,   0,    0))}
-                                                      
-mpl.colormaps.register(cmap=LinearSegmentedColormap(name='franzmap', segmentdata=franzmap_cm, N=256))
+
+_lscm = LinearSegmentedColormap(name='franzmap', segmentdata=franzmap_cm, N=256)
+try:
+    mpl.colormaps.register(cmap=_lscm)
+except AttributeError:
+    matplotlib.cm.register_cmap(cmap=_lscm)
 
 def franzmap():
     """\
@@ -426,7 +430,7 @@ def franzmap():
     im = mpl.pyplot.gci()
 
     if im is not None:
-        im.set_cmap(matplotlib.cm.get_cmap('franzmap'))
+        im.set_cmap(mpl.colormaps['franzmap'])
     mpl.pyplot.draw_if_interactive()
 
 
@@ -595,46 +599,46 @@ class PtyAxis(object):
     """
     Plot environment for matplotlib to allow for a Image plot with color axis,
     potentially of a potentially complex array.
-    
+
     Please note that this class may undergo changes or become obsolete altogether.
     """
     def __init__(self, ax=None, data=None, channel='r', cmap=None, fontsize=8, **kwargs):
         """
-        
+
         Parameters
         ----------
-        
+
         ax : pyplot.axis
             An axis in matplotlib figure. If ``None`` a figure with a single
             axis will be created.
-            
+
         data : numpy.ndarray
-            The (complex) twodimensional data to be displayed. 
-            
+            The (complex) twodimensional data to be displayed.
+
         channel : str
             Choose
-            
+
             - ``'a'`` to plot absolute/modulus value of the data,
             - ``'p'`` to plot phase value of the data,
             - ``'a'`` to plot real value of the data,
             - ``'a'`` to plot imaginary value of the data,
-            - ``'a'`` to plot a composite image where phase channel maps to hue and 
+            - ``'a'`` to plot a composite image where phase channel maps to hue and
               modulus channel maps to brightness of the color.
-              
-        cmap : str 
+
+        cmap : str
             String representation of one of matplotlibs colormaps.
-            
+
         fontsize : int
             Base font size of labels, etc.
-            
+
         Keyword Arguments
         -----------------
         vmin, vmax : float
             Minimum and maximum value for colormap scaling
-            
+
         rmramp : bool
             Remove phase ramp if ``True``, default is ``False``
-            
+
         """
         if ax is None:
             fig = plt.figure()
@@ -679,10 +683,10 @@ def set_channel(self, channel, update=True):
 
     def set_cmap(self, cmap, update=True):
         try:
-            self.cmap = mpl.colormaps.get_cmap(cmap)
+            self.cmap = mpl.colormaps[cmap]
         except:
             logger.debug("Colormap `%s` not found. Using `gray`" % str(cmap))
-            self.cmap = mpl.colormaps.get_cmap('gray')
+            self.cmap = mpl.colormaps['gray']
         if update:
             self._update()
             self._update_colorscale()
diff --git a/test/accelerate_tests/base_tests/WASP_tests.py b/test/accelerate_tests/base_tests/WASP_tests.py
index a0ea4514e..5cf73982c 100644
--- a/test/accelerate_tests/base_tests/WASP_tests.py
+++ b/test/accelerate_tests/base_tests/WASP_tests.py
@@ -99,6 +99,7 @@ def test_WASP_serial_base(self):
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.random_seed = 721
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
@@ -120,12 +121,28 @@ def test_WASP_serial_clip(self):
         if parallel.master:
             self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_WASP_serial_probe_start(self):
+        out = []
+        for eng in ["WASP", "WASP_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.probe_update_start = 4
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
     def test_WASP_serial_alpha_beta(self):
         out = []
         for eng in ["WASP", "WASP_serial"]:
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.alpha = 0.64
             engine_params.beta = 0.94
             engine_params.random_seed = 721
diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py
index 78540f978..4d8081903 100644
--- a/test/accelerate_tests/base_tests/array_utils_test.py
+++ b/test/accelerate_tests/base_tests/array_utils_test.py
@@ -2,11 +2,12 @@
 Tests for the array_utils module
 '''
 
-import unittest
+import unittest, pytest
 import numpy as np
 from ptypy.accelerate.base import FLOAT_TYPE, COMPLEX_TYPE
 from ptypy.accelerate.base import array_utils as au
 
+_numpy_version = tuple(map(int, np.__version__.split(".")))
 
 class ArrayUtilsTest(unittest.TestCase):
 
@@ -291,6 +292,7 @@ def test_crop_pad_3(self):
                          dtype=np.complex64)
         np.testing.assert_array_almost_equal(A, exp_A)
 
+    @pytest.mark.skipif(_numpy_version < (2,0), reason="requires Numpy 2.0 or higher to match expected output")
     def test_fft_filter(self):
         data = np.zeros((256, 512), dtype=COMPLEX_TYPE)
         data[64:-64,128:-128] = 1 + 1.j
@@ -342,6 +344,7 @@ def test_fft_filter(self):
         
         np.testing.assert_array_almost_equal(output.flat[::2000], known_test_output, decimal=5)
 
+    @pytest.mark.skipif(_numpy_version < (2,0), reason="requires Numpy 2.0 or higher to match expected output")
     def test_fft_filter_batched(self):
         data = np.zeros((2,256, 512), dtype=COMPLEX_TYPE)
         data[:,64:-64,128:-128] = 1 + 1.j
diff --git a/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py b/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
index 7ccda4930..91bc0a1fc 100644
--- a/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
+++ b/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
@@ -98,6 +98,7 @@ def test_WASP_cupy_base(self):
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.random_seed = 721
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
@@ -119,12 +120,28 @@ def test_WASP_cupy_clip(self):
         if parallel.master:
             self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_WASP_cupy_probe_start(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.probe_update_start = 4
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
     def test_WASP_cupy_alpha_beta(self):
         out = []
         for eng in ["WASP_serial", "WASP_cupy"]:
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.alpha = 0.64
             engine_params.beta = 0.94
             engine_params.random_seed = 721
diff --git a/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py b/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
index f8bf5f0c8..e8527481d 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
@@ -98,6 +98,7 @@ def test_WASP_pycuda_base(self):
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.random_seed = 721
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
@@ -119,12 +120,28 @@ def test_WASP_pycuda_clip(self):
         if parallel.master:
             self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_WASP_pycuda_probe_start(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.probe_update_start = 4
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
     def test_WASP_pycuda_alpha_beta(self):
         out = []
         for eng in ["WASP_serial", "WASP_pycuda"]:
             engine_params = u.Param()
             engine_params.name = eng
             engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
             engine_params.alpha = 0.64
             engine_params.beta = 0.94
             engine_params.random_seed = 721

From 9b18a2e29bc319ca1f025bb2a2a18de58d14024b Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 28 Mar 2025 19:39:42 +0000
Subject: [PATCH 05/12] Renamed engine tests to be picked up by pytest (#597)

* Renamed engine tests to be picked up by pytest

* Fix parsing of sigma in smoothing regulariser

* Disable smoothing regulariser test

* Disable CI hack for pyfftw and Python 3.12
---
 .github/workflows/test.yml                     |  6 +++---
 ptypy/accelerate/base/array_utils.py           | 18 ++++++++++++------
 ptypy/engines/ML.py                            |  2 +-
 .../base_tests/{WASP_tests.py => WASP_test.py} |  0
 .../{engine_tests.py => engine_test.py}        |  6 ++++--
 .../{WASP_tests.py => WASP_test.py}            |  0
 .../{WASP_tests.py => WASP_test.py}            |  0
 7 files changed, 20 insertions(+), 12 deletions(-)
 rename test/accelerate_tests/base_tests/{WASP_tests.py => WASP_test.py} (100%)
 rename test/accelerate_tests/base_tests/{engine_tests.py => engine_test.py} (96%)
 rename test/accelerate_tests/cuda_cupy_tests/{WASP_tests.py => WASP_test.py} (100%)
 rename test/accelerate_tests/cuda_pycuda_tests/{WASP_tests.py => WASP_test.py} (100%)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b37bc8c30..18aff26cf 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -52,9 +52,9 @@ jobs:
       run: |
         # replace python version in dependencies
         sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_${{ matrix.conda-env }}.yml
-        if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.12' ]; then
-          sed -i '/- pyfftw/d' dependencies_${{ matrix.conda-env }}.yml
-        fi
+        # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.12' ]; then
+        #   sed -i '/- pyfftw/d' dependencies_${{ matrix.conda-env }}.yml
+        # fi
         # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.9' ]; then
         #   sed -i '/- mpi4py/d' dependencies_${{ matrix.conda-env }}.yml
         # fi
diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py
index bec4cc8e4..5af588413 100644
--- a/ptypy/accelerate/base/array_utils.py
+++ b/ptypy/accelerate/base/array_utils.py
@@ -70,14 +70,20 @@ def complex_gaussian_filter(input, mfs):
     '''
     takes 2D and 3D arrays. Complex input, complex output. mfs has len 0<x<=2
     '''
-    if len(mfs) > 2:
+    if isinstance(mfs, list) and len(mfs) > 2:
         raise NotImplementedError("Only batches of 2D arrays allowed!")
+    if isinstance(mfs, (int, float)):
+        mfs_list = [mfs, mfs]
+        if input.ndim == 3:
+            mfs_list = [0, mfs, mfs]
+    elif isinstance(mfs, (list,tuple)):
+        mfs_list = list(mfs)
+        if input.ndim == 3:
+            mfs_list.insert(0,0)
+    else:
+        raise NotImplementedError(f"Cannot interpret mfs of type {type(mfs)}")
 
-    if input.ndim == 3:
-        mfs = np.insert(mfs, 0, 0)
-
-    return (ndi.gaussian_filter(np.real(input), mfs) + 1j * ndi.gaussian_filter(np.imag(input), mfs)).astype(
-        input.dtype)
+    return (ndi.gaussian_filter(np.real(input), mfs_list).astype(input.dtype) + 1j * ndi.gaussian_filter(np.imag(input), mfs_list))
 
 
 def complex_gaussian_filter_fft(input, mfs):
diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py
index 9d5bb7d1a..b5c93dfbe 100644
--- a/ptypy/engines/ML.py
+++ b/ptypy/engines/ML.py
@@ -1290,7 +1290,7 @@ def __init__(self, sigma):
             self.sigma = sigma
 
         def __call__(self, x):
-            return u.c_gf(x, [0, self.sigma, self.sigma])
+            return  u.c_gf(x, [0, self.sigma, self.sigma])
 
     # from scipy.signal import correlate2d
     # class HannFilt:
diff --git a/test/accelerate_tests/base_tests/WASP_tests.py b/test/accelerate_tests/base_tests/WASP_test.py
similarity index 100%
rename from test/accelerate_tests/base_tests/WASP_tests.py
rename to test/accelerate_tests/base_tests/WASP_test.py
diff --git a/test/accelerate_tests/base_tests/engine_tests.py b/test/accelerate_tests/base_tests/engine_test.py
similarity index 96%
rename from test/accelerate_tests/base_tests/engine_tests.py
rename to test/accelerate_tests/base_tests/engine_test.py
index dbfc1e9f6..855536993 100644
--- a/test/accelerate_tests/base_tests/engine_tests.py
+++ b/test/accelerate_tests/base_tests/engine_test.py
@@ -14,6 +14,7 @@
 import tempfile
 import shutil
 import numpy as np
+import pytest
 
 class MLSerialTest(unittest.TestCase):
 
@@ -68,9 +69,9 @@ def check_engine_output(self, output, plotting=False, debug=False):
         RMSE_ob = (np.mean(np.abs(OBJ_ML_serial - OBJ_ML)**2))
         RMSE_pr = (np.mean(np.abs(PRB_ML_serial - PRB_ML)**2))
         # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-1, 
                                     err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-1, 
                                     err_msg="The object arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
@@ -134,6 +135,7 @@ def test_ML_serial_floating(self):
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
         self.check_engine_output(out, plotting=False, debug=False)
 
+    @pytest.mark.skip(reason="Funny behaviour with this test, most likely related to Gaussian filter, see issue #607")
     def test_ML_serial_smoothing_regularizer(self):
         out = []
         for eng in ["ML", "ML_serial"]:
diff --git a/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py b/test/accelerate_tests/cuda_cupy_tests/WASP_test.py
similarity index 100%
rename from test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
rename to test/accelerate_tests/cuda_cupy_tests/WASP_test.py
diff --git a/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py b/test/accelerate_tests/cuda_pycuda_tests/WASP_test.py
similarity index 100%
rename from test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
rename to test/accelerate_tests/cuda_pycuda_tests/WASP_test.py

From d1ac423a5e36f497210f0633e0dd000200e05be2 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Mon, 28 Apr 2025 15:38:31 +0100
Subject: [PATCH 06/12] Ignore F824 for linting in actions (#608)

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 18aff26cf..e71bda774 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -69,7 +69,7 @@ jobs:
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        flake8 . --count --select=E9,F63,F7,F82 --ignore=F824 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest

From 7e5abf6df0344496c33a1c557b7f6e40efc296d5 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Mon, 28 Apr 2025 16:59:09 +0100
Subject: [PATCH 07/12] Cleaning up actions (#609)

* Cleaning up actions

* Make sure to use miniforge
---
 .github/workflows/test.yml | 46 +++++---------------------------------
 1 file changed, 5 insertions(+), 41 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e71bda774..1567443c1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -48,55 +48,19 @@ jobs:
         channels: conda-forge
         activate-environment: ptypy_env
         python-version: ${{ matrix.python-version }}
+        miniforge-version: latest
     - name: Install ${{ matrix.conda-env }} dependencies
       run: |
-        # replace python version in dependencies
         sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_${{ matrix.conda-env }}.yml
-        # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.12' ]; then
-        #   sed -i '/- pyfftw/d' dependencies_${{ matrix.conda-env }}.yml
-        # fi
-        # if [ ${{ matrix.conda-env }} == 'full' ] && [ ${{ matrix.python-version }} == '3.9' ]; then
-        #   sed -i '/- mpi4py/d' dependencies_${{ matrix.conda-env }}.yml
-        # fi
-        conda install --solver=classic mpich
+        conda install -c conda-forge mpich
         conda env update --file dependencies_${{ matrix.conda-env }}.yml --name ptypy_env
-        conda install --solver=classic flake8 pytest pytest-cov
+        conda install -c conda-forge flake8 pytest pytest-cov
         conda list
     - name: Prepare ptypy
-      run: |
-        # Install ptypy
-        pip install .
+      run: pip install .
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
         flake8 . --count --select=E9,F63,F7,F82 --ignore=F824 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with pytest
-      run: |
-        # pytest ptypy/test -v --doctest-modules --junitxml=junit/test-results.xml --cov=ptypy --cov-report=xml --cov-report=html --cov-config=.coveragerc
-        pytest -v
-#    - name: cobertura-report
-#      if: github.event_name == 'pull_request' && (github.event.action == 'opened' || github.event.action == 'reopened' || github.event.action == 'synchronize')
-#      uses: 5monkeys/cobertura-action@v7
-#      with:
-#        # The GITHUB_TOKEN for this repo
-#        repo_token:  ${{ secrets.GITHUB_TOKEN }}
-#        # Path to the cobertura file.
-#        path: coverage.xml
-#        # If files with 100% should be skipped from report.
-#        skip_covered: true
-#        # Minimum allowed coverage percentage as an integer.
-#        minimum_coverage: 90
-#        only_changed_files: true
-#    - name: Junit Report to Annotations
-#      uses: ashley-taylor/junit-report-annotations-action@master
-#      with:
-#        # github token
-#        access-token: ${{ secrets.GITHUB_TOKEN }}
-#        # glob to junit xml files
-#        path: junit/test-results.xml
-#        # include summary annotation
-#        includeSummary: true
-#        # max number of failed tests to include
-#        numFailures: 10
+      run: pytest -v

From 45dd0c0bd3dc0e4c9920b6fb8afa36cfaad080ea Mon Sep 17 00:00:00 2001
From: Benedikt Daurer <benedikt.daurer@diamond.ac.uk>
Date: Mon, 28 Apr 2025 19:30:43 +0100
Subject: [PATCH 08/12] Bump version to 0.9.0

---
 ptypy/version.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ptypy/version.py b/ptypy/version.py
index 16317abfa..0ae9661c4 100644
--- a/ptypy/version.py
+++ b/ptypy/version.py
@@ -1,6 +1,6 @@
 
-short_version = '0.8.2'
-version = '0.8.2'
+short_version = '0.9.0'
+version = '0.9.0'
 release = False
 
 if not release:

From 205248a954e4acc519dba744428372999367a4ee Mon Sep 17 00:00:00 2001
From: Jari <jfowkes@users.noreply.github.com>
Date: Mon, 28 Apr 2025 19:55:36 +0100
Subject: [PATCH 09/12] Add wavefield preconditioner (#584)

* Add Lipschitz preconditioner

* Move Lipschitz preconditioner before smoothing preconditioner

* Use sqrt of fluence and apply twice

* Take sqrt of the sum

* Start serializing wavefield precond

* Make requested changes to ML.py

* Make changes to ML_serial and add kernel functions

* Fix smoothing call in ML_serial

* Fix abs2 call in kernels

* Add kernel regression tests

* Tidy up new kernel tests

* Fix new kernel tests

* Changes to ML_cupy and ML_pycuda

* Fix pycuda.cumath import

* Add cupy and pycuda kernels

* Add serial wavefield precond test

* Add cupy and pycuda wavefield precond tests

* Add cupy and pycuda kernel regression tests

* Add atomic CUDA kernels

* Fix types in CUDA kernels

* Add non-atomic CUDA kernels

* Bugfixes in wavefield kernels

* Better abs2 in wavefield kernels

* Fixed wavefield kernels and tests

* Revert change in templates

* Fixed logic in ML engine

* Fix login in ML_serial

* Final fixes to make wavefield precond work with GPU engines

* Remove notebook

* More bugfixes in ML cupy/pycuda engine

* Skip ML wavefield engine test

---------

Co-authored-by: Benedikt Daurer <benedikt.daurer@diamond.ac.uk>
---
 ptypy/accelerate/base/engines/ML_serial.py    |  67 ++++++++--
 ptypy/accelerate/base/kernels.py              |  26 ++++
 .../cuda_common/ob_update2_ML_wavefield.cu    | 123 ++++++++++++++++++
 .../cuda_common/ob_update_ML_wavefield.cu     |  73 +++++++++++
 .../cuda_common/pr_update2_ML_wavefield.cu    | 123 ++++++++++++++++++
 .../cuda_common/pr_update_ML_wavefield.cu     |  73 +++++++++++
 ptypy/accelerate/cuda_cupy/engines/ML_cupy.py |  53 +++++++-
 ptypy/accelerate/cuda_cupy/kernels.py         | 105 +++++++++++++++
 .../cuda_pycuda/engines/ML_pycuda.py          |  51 +++++++-
 ptypy/accelerate/cuda_pycuda/kernels.py       |  88 +++++++++++++
 ptypy/engines/ML.py                           | 102 +++++++++++++--
 templates/engines/cupy/moonflower_ML_cupy.py  |   4 +-
 .../cupy/moonflower_ML_cupy_wavefield.py      |  58 +++++++++
 .../numpy/moonflower_ML_Gaussian_wavefield.py |  56 ++++++++
 .../engines/pycuda/moonflower_ML_pycuda.py    |   2 +-
 .../pycuda/moonflower_ML_pycuda_wavefield.py  |  58 +++++++++
 .../serial/moonflower_ML_serial_wavefield.py  |  59 +++++++++
 .../base_tests/engine_test.py                 |  22 +++-
 .../base_tests/po_update_kernel_test.py       |  96 +++++++++++++-
 .../cuda_cupy_tests/engine_tests.py           |  25 +++-
 .../cuda_cupy_tests/po_update_kernel_test.py  | 121 +++++++++++++++++
 .../cuda_pycuda_tests/engine_tests.py         |  25 +++-
 .../po_update_kernel_test.py                  | 121 +++++++++++++++++
 23 files changed, 1494 insertions(+), 37 deletions(-)
 create mode 100644 ptypy/accelerate/cuda_common/ob_update2_ML_wavefield.cu
 create mode 100644 ptypy/accelerate/cuda_common/ob_update_ML_wavefield.cu
 create mode 100644 ptypy/accelerate/cuda_common/pr_update2_ML_wavefield.cu
 create mode 100644 ptypy/accelerate/cuda_common/pr_update_ML_wavefield.cu
 create mode 100644 templates/engines/cupy/moonflower_ML_cupy_wavefield.py
 create mode 100644 templates/engines/numpy/moonflower_ML_Gaussian_wavefield.py
 create mode 100644 templates/engines/pycuda/moonflower_ML_pycuda_wavefield.py
 create mode 100644 templates/engines/serial/moonflower_ML_serial_wavefield.py

diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py
index 1e953d26a..3618d0f9c 100644
--- a/ptypy/accelerate/base/engines/ML_serial.py
+++ b/ptypy/accelerate/base/engines/ML_serial.py
@@ -50,6 +50,7 @@ def __init__(self, ptycho_parent, pars=None):
         self.diff_info = {}
         self.cn2_ob_grad = 0.
         self.cn2_pr_grad = 0.
+        self.sqrt = np.sqrt
 
     def engine_initialize(self):
         """
@@ -162,6 +163,13 @@ def _get_smooth_gradient(self, data, sigma):
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
+
+        # Wavefield preconditioner for the object
+        if self.p.wavefield_precond:
+            for name, s in new_ob_grad.storages.items():
+                s.data /= np.sqrt(self.ob_fln.storages[name].data
+                                  + self.p.wavefield_delta_object)
+
         # Smoothing preconditioner
         if self.smooth_gradient:
             self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -175,7 +183,8 @@ def _replace_ob_grad(self):
 
     def _replace_pr_grad(self):
         new_pr_grad = self.pr_grad_new
-        # probe support
+
+        # Probe support
         if self.p.probe_update_start <= self.curiter:
             # Apply probe support if needed
             for name, s in new_pr_grad.storages.items():
@@ -183,6 +192,12 @@ def _replace_pr_grad(self):
         else:
             new_pr_grad.fill(0.)
 
+        # Wavefield preconditioner for the probe
+        if self.p.wavefield_precond:
+            for name, s in new_pr_grad.storages.items():
+                s.data /= np.sqrt(self.pr_fln.storages[name].data
+                                  + self.p.wavefield_delta_probe)
+
         norm = Cnorm2(new_pr_grad)
         dot = np.real(Cdot(new_pr_grad, self.pr_grad))
         self.pr_grad << new_pr_grad
@@ -240,19 +255,36 @@ def engine_iterate(self, num=1):
             self.cn2_pr_grad = cn2_new_pr_grad
 
             dt = self.ptycho.FType
+
             # 3. Next conjugate
             self.ob_h *= dt(bt / self.tmin)
-
-            # Smoothing preconditioner
-            if self.smooth_gradient:
+            # Wavefield preconditioner for the object (with and without smoothing preconditioner)
+            if self.p.wavefield_precond:
                 for name, s in self.ob_h.storages.items():
-                    s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data, self.smooth_gradient.sigma)
+                    if self.smooth_gradient:
+                        s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data
+                                      / self.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                                      , self.smooth_gradient.sigma)
+                    else:
+                        s.data[:] -= (self.ob_grad.storages[name].data
+                                      / self.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object))
             else:
-                self.ob_h -= self.ob_grad
+                # Smoothing preconditioner for the object
+                if self.smooth_gradient:
+                    for name, s in self.ob_h.storages.items():
+                        s.data[:] -= self._get_smooth_gradient(self.ob_grad.storages[name].data, self.smooth_gradient.sigma)
+                else:
+                    self.ob_h -= self.ob_grad
 
             self.pr_h *= dt(bt / self.tmin)
             self.pr_grad *= dt(self.scale_p_o)
-            self.pr_h -= self.pr_grad
+            # Wavefield preconditioner for the probe
+            if self.p.wavefield_precond:
+                for name, s in self.pr_h.storages.items():
+                    s.data[:] -= (self.pr_grad.storages[name].data
+                                  / self.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe))
+            else:
+                self.pr_h -= self.pr_grad
 
             # In principle, the way things are now programmed this part
             # could be iterated over in a real Newton-Raphson style.
@@ -416,6 +448,11 @@ def new_grad(self):
         pr_grad = self.engine.pr_grad_new
         ob_grad << 0.
         pr_grad << 0.
+        if self.engine.p.wavefield_precond:
+            ob_fln = self.engine.ob_fln
+            pr_fln = self.engine.pr_fln
+            ob_fln << 0.
+            pr_fln << 0.
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -450,6 +487,11 @@ def new_grad(self):
             prg = pr_grad.S[pID].data
             I = prep.I
 
+            # local references for wavefield precond
+            if self.engine.p.wavefield_precond:
+                obf = ob_fln.S[oID].data
+                prf = pr_fln.S[pID].data
+
             # make propagated exit (to buffer)
             AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
 
@@ -465,8 +507,12 @@ def new_grad(self):
             GDK.error_reduce(addr, err_phot)
             aux[:] = BW(aux)
 
-            POK.ob_update_ML(addr, obg, pr, aux)
-            POK.pr_update_ML(addr, prg, ob, aux)
+            if self.engine.p.wavefield_precond:
+                POK.ob_update_ML_wavefield(addr, obg, obf, pr, aux)
+                POK.pr_update_ML_wavefield(addr, prg, prf, ob, aux)
+            else:
+                POK.ob_update_ML(addr, obg, pr, aux)
+                POK.pr_update_ML(addr, prg, ob, aux)
 
         for dID, prep in self.engine.diff_info.items():
             err_phot = prep.err_phot
@@ -480,6 +526,9 @@ def new_grad(self):
         # MPI reduction of gradients
         ob_grad.allreduce()
         pr_grad.allreduce()
+        if self.engine.p.wavefield_precond:
+            ob_fln.allreduce()
+            pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py
index c5343f940..22eb37747 100644
--- a/ptypy/accelerate/base/kernels.py
+++ b/ptypy/accelerate/base/kernels.py
@@ -608,6 +608,32 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0):
                 ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac
         return
 
+    def ob_update_ML_wavefield(self, addr, ob, obf, pr, ex, fac=2.0):
+
+        sh = addr.shape
+        flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
+        rows, cols = ex.shape[-2:]
+        for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr):
+            ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \
+                pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj() * \
+                ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac
+            obf[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += \
+                abs2(pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols])
+        return
+
+    def pr_update_ML_wavefield(self, addr, pr, prf, ob, ex, fac=2.0):
+
+        sh = addr.shape
+        flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
+        rows, cols = ex.shape[-2:]
+        for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr):
+            pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \
+                ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj() * \
+                ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] * fac
+            prf[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += \
+                abs2(ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols])
+        return
+
     def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
         sh = addr.shape
         flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
diff --git a/ptypy/accelerate/cuda_common/ob_update2_ML_wavefield.cu b/ptypy/accelerate/cuda_common/ob_update2_ML_wavefield.cu
new file mode 100644
index 000000000..649ce4a89
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/ob_update2_ML_wavefield.cu
@@ -0,0 +1,123 @@
+/** ob_update2_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator for the ob field
+ *
+ * NOTE: This version of ob_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in ob_update_ML_wavefield.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */
+
+#include "common.cuh"
+
+#define pr_dlayer(k) addr[(k)]
+#define ex_dlayer(k) addr[6 * num_pods + (k)]
+#define obj_dlayer(k) addr[3 * num_pods + (k)]
+#define obj_roi_row(k) addr[4 * num_pods + (k)]
+#define obj_roi_column(k) addr[5 * num_pods + (k)]
+
+extern "C" __global__ void ob_update2_ML_wavefield(int pr_sh,
+                                                   int ob_modes,
+                                                   int num_pods,
+                                                   int ob_sh_rows,
+                                                   int ob_sh_cols,
+                                                   int pr_modes,
+                                                   complex<OUT_TYPE>* ob_g,
+                                                   const complex<IN_TYPE>* __restrict__ pr_g,
+                                                   const complex<IN_TYPE>* __restrict__ ex_g,
+                                                   OUT_TYPE* ob_f,
+                                                   const int* addr,
+                                                   IN_TYPE fac_)
+{
+  int y = blockIdx.y * BDIM_Y + threadIdx.y;
+  int dy = ob_sh_rows;
+  int z = blockIdx.x * BDIM_X + threadIdx.x;
+  int dz = ob_sh_cols;
+  MATH_TYPE fac = fac_;
+  complex<ACC_TYPE> ob[NUM_MODES];
+  ACC_TYPE of[NUM_MODES];
+
+  int txy = threadIdx.y * BDIM_X + threadIdx.x;
+  assert(ob_modes <= NUM_MODES);
+
+  if (y < dy && z < dz)
+  {
+#pragma unroll
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      auto idx = i * dy * dz + y * dz + z;
+      assert(idx < ob_modes * ob_sh_rows * ob_sh_cols);
+      ob[i] = ob_g[idx];
+      of[i] = ob_f[idx];
+    }
+  }
+
+  __shared__ int addresses[BDIM_X * BDIM_Y * 5];
+
+  for (int p = 0; p < num_pods; p += BDIM_X * BDIM_Y)
+  {
+    int mi = BDIM_X * BDIM_Y;
+    if (mi > num_pods - p)
+      mi = num_pods - p;
+
+    if (p > 0)
+      __syncthreads();
+
+    if (txy < mi)
+    {
+      assert(p + txy < num_pods);
+      assert(txy < BDIM_X * BDIM_Y);
+      addresses[txy * 5 + 0] = pr_dlayer(p + txy);
+      addresses[txy * 5 + 1] = ex_dlayer(p + txy);
+      addresses[txy * 5 + 2] = obj_dlayer(p + txy);
+      assert(obj_dlayer(p + txy) < NUM_MODES);
+      assert(addresses[txy * 5 + 2] < NUM_MODES);
+      addresses[txy * 5 + 3] = obj_roi_row(p + txy);
+      addresses[txy * 5 + 4] = obj_roi_column(p + txy);
+    }
+
+    __syncthreads();
+
+    if (y >= dy || z >= dz)
+      continue;
+
+#pragma unroll 4
+    for (int i = 0; i < mi; ++i)
+    {
+      int* ad = addresses + i * 5;
+      int v1 = y - ad[3];
+      int v2 = z - ad[4];
+      if (v1 >= 0 && v1 < pr_sh && v2 >= 0 && v2 < pr_sh)
+      {
+        auto pridx = ad[0] * pr_sh * pr_sh + v1 * pr_sh + v2;
+        assert(pridx < pr_modes * pr_sh * pr_sh);
+        complex<MATH_TYPE> pr = pr_g[pridx];
+        int idx = ad[2];
+        assert(idx < NUM_MODES);
+        auto cpr = conj(pr);
+        auto exidx = ad[1] * pr_sh * pr_sh + v1 * pr_sh + v2;
+        complex<MATH_TYPE> ex_val = ex_g[exidx];
+        complex<ACC_TYPE> add_val = cpr * ex_val * fac;
+        ob[idx] += add_val;
+        complex<MATH_TYPE> abs2_val = cpr * pr;
+        ACC_TYPE add_val2 = abs2_val.real();
+        of[idx] += add_val2;
+      }
+    }
+  }
+
+  if (y < dy && z < dz)
+  {
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      ob_g[i * dy * dz + y * dz + z] = ob[i];
+      ob_f[i * dy * dz + y * dz + z] = of[i];
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_common/ob_update_ML_wavefield.cu b/ptypy/accelerate/cuda_common/ob_update_ML_wavefield.cu
new file mode 100644
index 000000000..bab2fd620
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/ob_update_ML_wavefield.cu
@@ -0,0 +1,73 @@
+/** ob_update_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
+#include "common.cuh"
+
+template <class T>
+__device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
+{
+  auto xf = reinterpret_cast<T*>(x);
+  atomicAdd(xf, y.real());
+  atomicAdd(xf + 1, y.imag());
+}
+
+extern "C"
+{
+  __global__ void ob_update_ML_wavefield(const complex<IN_TYPE>* __restrict__ exit_wave,
+                                         int A,
+                                         int B,
+                                         int C,
+                                         const complex<IN_TYPE>* __restrict__ probe,
+                                         int D,
+                                         int E,
+                                         int F,
+                                         complex<OUT_TYPE>* obj,
+                                         int G,
+                                         int H,
+                                         int I,
+                                         OUT_TYPE* obj_fln,
+                                         const int* __restrict__ addr,
+                                         IN_TYPE fac_)
+  {
+    const int bid = blockIdx.x;
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
+    const int addr_stride = 15;
+    MATH_TYPE fac = fac_;
+
+    const int* oa = addr + 3 + bid * addr_stride;
+    const int* pa = addr + bid * addr_stride;
+    const int* ea = addr + 6 + bid * addr_stride;
+
+    probe += pa[0] * E * F + pa[1] * F + pa[2];
+    obj += oa[0] * H * I + oa[1] * I + oa[2];
+    obj_fln += oa[0] * H * I + oa[1] * I + oa[2];
+
+    assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I);
+
+    exit_wave += ea[0] * B * C;
+
+    for (int b = ty; b < B; b += blockDim.y)
+    {
+      for (int c = tx; c < C; c += blockDim.x)
+      {
+        complex<MATH_TYPE> probe_val = probe[b * F + c];
+        complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+        complex<MATH_TYPE> add_val_m = conj(probe_val) * exit_val * fac;
+        complex<OUT_TYPE> add_val = add_val_m;
+
+        atomicAdd(&obj[b * I + c], add_val);
+
+        complex<MATH_TYPE> abs2_val = conj(probe_val) * probe_val;
+        OUT_TYPE add_val2 = abs2_val.real();
+
+        atomicAdd(&obj_fln[b * I + c], add_val2);
+      }
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_common/pr_update2_ML_wavefield.cu b/ptypy/accelerate/cuda_common/pr_update2_ML_wavefield.cu
new file mode 100644
index 000000000..567d928be
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/pr_update2_ML_wavefield.cu
@@ -0,0 +1,123 @@
+/** pr_update2_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ * - ACC_TYPE: accumulator type for local pr array
+ *
+ * NOTE: This version of pr_update goes over all tiles that need to be accumulated
+ * in a single thread block to avoid global atomic additions (as in pr_update_ML_wavefield.cu).
+ * This requires a local array of NUM_MODES size to store the local updates.
+ * GPU registers per thread are limited (255 32bit registers on V100),
+ * and at some point the registers will spill into shared or global memory
+ * and the kernel will get considerably slower.
+ */
+
+#include "common.cuh"
+
+#define pr_dlayer(k) addr[(k)]
+#define ex_dlayer(k) addr[6 * num_pods + (k)]
+#define obj_dlayer(k) addr[3 * num_pods + (k)]
+#define obj_roi_row(k) addr[4 * num_pods + (k)]
+#define obj_roi_column(k) addr[5 * num_pods + (k)]
+
+extern "C" __global__ void pr_update2_ML_wavefield(int pr_sh,
+                                                   int ob_sh_row,
+                                                   int ob_sh_col,
+                                                   int pr_modes,
+                                                   int ob_modes,
+                                                   int num_pods,
+                                                   complex<OUT_TYPE>* pr_g,
+                                                   const complex<IN_TYPE>* __restrict__ ob_g,
+                                                   const complex<IN_TYPE>* __restrict__ ex_g,
+                                                   OUT_TYPE* pr_f,
+                                                   const int* addr,
+                                                   IN_TYPE fac_)
+{
+  int y = blockIdx.y * BDIM_Y + threadIdx.y;
+  int dy = pr_sh;
+  int z = blockIdx.x * BDIM_X + threadIdx.x;
+  int dz = pr_sh;
+  MATH_TYPE fac = fac_;
+  complex<ACC_TYPE> pr[NUM_MODES];
+  ACC_TYPE pf[NUM_MODES];
+
+  int txy = threadIdx.y * BDIM_X + threadIdx.x;
+  assert(pr_modes <= NUM_MODES);
+
+  if (y < pr_sh && z < pr_sh)
+  {
+#pragma unroll
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      auto idx = i * dy * dz + y * dz + z;
+      assert(idx < pr_modes * pr_sh * pr_sh);
+      pr[i] = pr_g[idx];
+      pf[i] = pr_f[idx];
+    }
+  }
+
+  __shared__ int addresses[BDIM_X * BDIM_Y * 5];
+
+  for (int p = 0; p < num_pods; p += BDIM_X * BDIM_Y)
+  {
+    int mi = BDIM_X * BDIM_Y;
+    if (mi > num_pods - p)
+      mi = num_pods - p;
+
+    if (p > 0)
+      __syncthreads();
+
+    if (txy < mi)
+    {
+      assert(p + txy < num_pods);
+      assert(txy < BDIM_X * BDIM_Y);
+      addresses[txy * 5 + 0] = pr_dlayer(p + txy);
+      addresses[txy * 5 + 1] = ex_dlayer(p + txy);
+      addresses[txy * 5 + 2] = obj_dlayer(p + txy);
+      assert(obj_dlayer(p + txy) < NUM_MODES);
+      assert(addresses[txy * 5 + 2] < NUM_MODES);
+      addresses[txy * 5 + 3] = obj_roi_row(p + txy);
+      addresses[txy * 5 + 4] = obj_roi_column(p + txy);
+    }
+
+    __syncthreads();
+
+    if (y >= pr_sh || z >= pr_sh)
+      continue;
+
+#pragma unroll 4
+    for (int i = 0; i < mi; ++i)
+    {
+      int* ad = addresses + i * 5;
+      int v1 = y + ad[3];
+      int v2 = z + ad[4];
+      if (v1 >= 0 && v1 < ob_sh_row && v2 >= 0 && v2 < ob_sh_col)
+      {
+        auto obidx = ad[2] * ob_sh_row * ob_sh_col + v1 * ob_sh_col + v2;
+        assert(obidx < ob_modes * ob_sh_row * ob_sh_col);
+        complex<MATH_TYPE> ob = ob_g[obidx];
+        int idx = ad[0];
+        assert(idx < NUM_MODES);
+        auto cob = conj(ob);
+        auto exidx = ad[1] * pr_sh * pr_sh + y * pr_sh + z;
+        complex<MATH_TYPE> ex_val = ex_g[exidx];
+        complex<ACC_TYPE> add_val = cob * ex_val * fac;
+        pr[idx] += add_val;
+        complex<MATH_TYPE> abs2_val = cob * ob;
+        ACC_TYPE add_val2 = abs2_val.real();
+        pf[idx] += add_val2;
+      }
+    }
+  }
+
+  if (y < pr_sh && z < pr_sh)
+  {
+    for (int i = 0; i < NUM_MODES; ++i)
+    {
+      pr_g[i * dy * dz + y * dz + z] = pr[i];
+      pr_f[i * dy * dz + y * dz + z] = pf[i];
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_common/pr_update_ML_wavefield.cu b/ptypy/accelerate/cuda_common/pr_update_ML_wavefield.cu
new file mode 100644
index 000000000..de236fc7c
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/pr_update_ML_wavefield.cu
@@ -0,0 +1,73 @@
+#/** pr_update_ML_wavefield.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
+#include "common.cuh"
+
+template <class T>
+__device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
+{
+  auto xf = reinterpret_cast<T*>(x);
+  atomicAdd(xf, y.real());
+  atomicAdd(xf + 1, y.imag());
+}
+
+extern "C"
+{
+  __global__ void pr_update_ML_wavefield(const complex<IN_TYPE>* __restrict__ exit_wave,
+                                                  int A,
+                                                  int B,
+                                                  int C,
+                                                  complex<OUT_TYPE>* probe,
+                                                  int D,
+                                                  int E,
+                                                  int F,
+                                                  const complex<IN_TYPE>* __restrict__ obj,
+                                                  int G,
+                                                  int H,
+                                                  int I,
+                                                  OUT_TYPE* probe_fln,
+                                                  const int* __restrict__ addr,
+                                                  IN_TYPE fac_)
+  {
+    const int bid = blockIdx.x;
+    const int tx = threadIdx.x;
+    const int ty = threadIdx.y;
+    const int addr_stride = 15;
+    MATH_TYPE fac = fac_;
+
+    const int* oa = addr + 3 + bid * addr_stride;
+    const int* pa = addr + bid * addr_stride;
+    const int* ea = addr + 6 + bid * addr_stride;
+
+    probe += pa[0] * E * F + pa[1] * F + pa[2];
+    probe_fln += pa[0] * E * F + pa[1] * F + pa[2];
+    obj += oa[0] * H * I + oa[1] * I + oa[2];
+
+    assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I);
+
+    exit_wave += ea[0] * B * C;
+
+    for (int b = ty; b < B; b += blockDim.y)
+    {
+      for (int c = tx; c < C; c += blockDim.x)
+      {
+        complex<MATH_TYPE> obj_val = obj[b * I + c];
+        complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+        complex<MATH_TYPE> add_val_m = conj(obj_val) * exit_val * fac;
+        complex<OUT_TYPE> add_val = add_val_m;
+
+        atomicAdd(&probe[b * F + c], add_val);
+
+        complex<MATH_TYPE> abs2_val = conj(obj_val) * obj_val;
+        OUT_TYPE add_val2 = abs2_val.real();
+
+        atomicAdd(&probe_fln[b * F + c], add_val2);
+      }
+    }
+  }
+}
diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
index caa0a192d..5f35fbd67 100644
--- a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
+++ b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
@@ -69,6 +69,7 @@ def __init__(self, ptycho_parent, pars=None):
         Maximum likelihood reconstruction engine.
         """
         super().__init__(ptycho_parent, pars)
+        self.sqrt = cp.sqrt
 
     def engine_initialize(self):
         """
@@ -280,6 +281,13 @@ def _get_smooth_gradient(self, data, sigma):
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
+
+        # Wavefield preconditioner for the object
+        if self.p.wavefield_precond:
+            for name, s in new_ob_grad.storages.items():
+                s.gpu /= cp.sqrt(self.ob_fln.storages[name].gpu
+                                 + self.p.wavefield_delta_object)
+
         # Smoothing preconditioner
         if self.smooth_gradient:
             self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -291,7 +299,8 @@ def _replace_ob_grad(self):
 
     def _replace_pr_grad(self):
         new_pr_grad = self.pr_grad_new
-        # probe support
+
+        # Probe support
         if self.p.probe_update_start <= self.curiter:
             # Apply probe support if needed
             for name, s in new_pr_grad.storages.items():
@@ -299,6 +308,12 @@ def _replace_pr_grad(self):
         else:
             new_pr_grad.fill(0.)
 
+        # Wavefield preconditioner for the probe
+        if self.p.wavefield_precond:
+            for name, s in new_pr_grad.storages.items():
+                s.gpu /= cp.sqrt(self.pr_fln.storages[name].gpu
+                                 + self.p.wavefield_delta_probe)
+
         return self._replace_grad(self.pr_grad, new_pr_grad)
 
     def _replace_grad(self, grad, new_grad):
@@ -318,7 +333,7 @@ def engine_iterate(self, num=1):
         return err
 
     def position_update(self):
-        """ 
+        """
         Position refinement
         """
         if not self.do_position_refinement or (not self.curiter):
@@ -503,6 +518,12 @@ def new_grad(self):
         qu_htod = self.engine.qu_htod
         queue = self.engine.queue
 
+        if self.engine.p.wavefield_precond:
+            ob_fln = self.engine.ob_fln
+            pr_fln = self.engine.pr_fln
+            ob_fln << 0.
+            pr_fln << 0.
+
         self.engine._set_pr_ob_ref_for_data('gpu')
         ob_grad << 0.
         pr_grad << 0.
@@ -536,6 +557,9 @@ def new_grad(self):
             obg = ob_grad.S[oID].data
             pr = self.engine.pr.S[pID].data
             prg = pr_grad.S[pID].data
+            if self.engine.p.wavefield_precond:
+                obf = ob_fln.S[oID].data
+                prf = pr_fln.S[pID].data
 
             # Schedule w & I to device
             ev_w, w, data_w = self.engine.w_data.to_gpu(
@@ -564,11 +588,17 @@ def new_grad(self):
 
             use_atomics = self.p.object_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.ob_update_ML_wavefield(addr, obg, obf, pr, aux, atomics=use_atomics)
+            else:
+                POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
 
             use_atomics = self.p.probe_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.pr_update_ML_wavefield(addr, prg, prf, ob, aux, atomics=use_atomics)
+            else:
+                POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
 
         queue.synchronize()
         self.engine.dID_list.reverse()
@@ -591,10 +621,18 @@ def new_grad(self):
             s.gpu.get(out=s.cpu)
         for s in pr_grad.S.values():
             s.gpu.get(out=s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.get(out=s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.get(out=s.cpu)
         self.engine._set_pr_ob_ref_for_data('cpu')
 
         ob_grad.allreduce()
         pr_grad.allreduce()
+        if self.engine.p.wavefield_precond:
+            ob_fln.allreduce()
+            pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # HtoD cause we continue on gpu
@@ -602,6 +640,11 @@ def new_grad(self):
             s.gpu.set(s.cpu)
         for s in pr_grad.S.values():
             s.gpu.set(s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.set(s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.set(s.cpu)
         self.engine._set_pr_ob_ref_for_data('gpu')
 
         # Object regularizer
@@ -746,7 +789,7 @@ def delxf(x, axis=-1):
         self.dot = lambda x, y: self.AUK.dot(x, y).get().item()
 
         self._grad_reg_kernel = cp.ElementwiseKernel(
-            "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx", 
+            "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx",
             "complex64 out",
             "out = (px+py-my-mx) * fac",
             "grad_reg",
diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py
index b743e9b08..07cfe8a42 100644
--- a/ptypy/accelerate/cuda_cupy/kernels.py
+++ b/ptypy/accelerate/cuda_cupy/kernels.py
@@ -909,6 +909,18 @@ def __init__(self, queue_thread=None,
             'MATH_TYPE': self.math_type
         })
         self.pr_update2_ML_cuda = None
+        self.ob_update_ML_wavefield_cuda = load_kernel("ob_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.ob_update2_ML_wavefield_cuda = None
+        self.pr_update_ML_wavefield_cuda = load_kernel("pr_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.pr_update2_ML_wavefield_cuda = None
         self.ob_update_local_cuda = load_kernel("ob_update_local", {
             'IN_TYPE': 'float',
             'OUT_TYPE': 'float',
@@ -1096,6 +1108,54 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
                                           np.float32(
                                         fac) if ex.dtype == np.complex64 else np.float64(fac)))
 
+    def ob_update_ML_wavefield(self, addr, ob, of, pr, ex, fac=2.0, atomics=True):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+
+        if self.queue is not None:
+            self.queue.use()
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError(
+                    'Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.ob_update_ML_wavefield_cuda(grid=(int(num_pods), 1, 1),
+                                             block=(32, 32, 1),
+                                             args=(ex, num_pods, exsh[1], exsh[2],
+                                                   pr, prsh[0], prsh[1], prsh[2],
+                                                   ob, obsh[0], obsh[1], obsh[2],
+                                                   of,
+                                                   addr,
+                                                   np.float32(
+                                                 fac) if ex.dtype == np.complex64 else np.float64(fac)))
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError(
+                    'Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.ob_update2_ML_wavefield_cuda:
+                self.ob_update2_ML_wavefield_cuda = load_kernel("ob_update2_ML_wavefield", {
+                    "NUM_MODES": obsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+            grid = [int((x+15)//16) for x in ob.shape[-2:]]
+            grid = (grid[1], grid[0], int(1))
+            self.ob_update2_ML_wavefield_cuda(grid=grid,
+                                              block=(16, 16, 1),
+                                              args=(prsh[-1], obsh[0], num_pods,
+                                                    obsh[-2], obsh[-1], prsh[0],
+                                                    ob, pr, ex, of, addr,
+                                                    np.float32(
+                                                  fac) if ex.dtype == np.complex64 else np.float64(fac)))
+
     def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
         obsh = [np.int32(ax) for ax in ob.shape]
         prsh = [np.int32(ax) for ax in pr.shape]
@@ -1140,6 +1200,51 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
                                           np.float32(
                                         fac) if ex.dtype == np.complex64 else np.float64(fac)))
 
+    def pr_update_ML_wavefield(self, addr, pr, pf, ob, ex, fac=2.0, atomics=False):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        if self.queue is not None:
+            self.queue.use()
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError(
+                    'Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.pr_update_ML_wavefield_cuda(grid=(int(num_pods), 1, 1),
+                                             block=(32, 32, 1),
+                                             args=(ex, num_pods, prsh[1], prsh[2],
+                                                   pr, prsh[0], prsh[1], prsh[2],
+                                                   ob, obsh[0], obsh[1], obsh[2],
+                                                   pf,
+                                                   addr,
+                                                   np.float32(
+                                                 fac) if ex.dtype == np.complex64 else np.float64(fac)))
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError(
+                    'Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.pr_update2_ML_wavefield_cuda:
+                self.pr_update2_ML_wavefield_cuda = load_kernel("pr_update2_ML_wavefield", {
+                    "NUM_MODES": prsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+
+            grid = [int((x+15)//16) for x in pr.shape[-2:]]
+            grid = (grid[0], grid[1], int(1))
+            self.pr_update2_ML_wavefield_cuda(grid=grid,
+                                              block=(16, 16, 1),
+                                              args=(prsh[-1], obsh[-2], obsh[-1],
+                                                    prsh[0], obsh[0], num_pods,
+                                                    pr, ob, ex, pf, addr,
+                                                    np.float32(
+                                                  fac) if ex.dtype == np.complex64 else np.float64(fac)))
+
     def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
         if self.queue is not None:
             self.queue.use()
diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
index d527d9f15..a05839b47 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
@@ -14,7 +14,7 @@
 import numpy as np
 from pycuda import gpuarray
 import pycuda.driver as cuda
-import pycuda.cumath
+import pycuda.cumath as cm
 
 from ptypy.engines import register
 from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial
@@ -73,6 +73,7 @@ def __init__(self, ptycho_parent, pars=None):
         Maximum likelihood reconstruction engine.
         """
         super().__init__(ptycho_parent, pars)
+        self.sqrt = cm.sqrt
 
     def engine_initialize(self):
         """
@@ -275,6 +276,13 @@ def _get_smooth_gradient(self, data, sigma):
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
+
+        # Wavefield preconditioner for the object
+        if self.p.wavefield_precond:
+            for name, s in new_ob_grad.storages.items():
+                s.gpu /= cm.sqrt(self.ob_fln.storages[name].gpu
+                              + self.p.wavefield_delta_object)
+
         # Smoothing preconditioner
         if self.smooth_gradient:
             self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -285,7 +293,8 @@ def _replace_ob_grad(self):
 
     def _replace_pr_grad(self):
         new_pr_grad = self.pr_grad_new
-        # probe support
+
+        # Probe support
         if self.p.probe_update_start <= self.curiter:
             # Apply probe support if needed
             for name, s in new_pr_grad.storages.items():
@@ -293,6 +302,12 @@ def _replace_pr_grad(self):
         else:
             new_pr_grad.fill(0.)
 
+        # Wavefield preconditioner for the probe
+        if self.p.wavefield_precond:
+            for name, s in new_pr_grad.storages.items():
+                s.gpu /= cm.sqrt(self.pr_fln.storages[name].gpu
+                              + self.p.wavefield_delta_probe)
+
         return self._replace_grad(self.pr_grad , new_pr_grad)
 
     def _replace_grad(self, grad, new_grad):
@@ -485,6 +500,12 @@ def new_grad(self):
         qu_htod = self.engine.qu_htod
         queue = self.engine.queue
 
+        if self.engine.p.wavefield_precond:
+            ob_fln = self.engine.ob_fln
+            pr_fln = self.engine.pr_fln
+            ob_fln << 0.
+            pr_fln << 0.
+
         self.engine._set_pr_ob_ref_for_data('gpu')
         ob_grad << 0.
         pr_grad << 0.
@@ -518,6 +539,9 @@ def new_grad(self):
             obg = ob_grad.S[oID].data
             pr = self.engine.pr.S[pID].data
             prg = pr_grad.S[pID].data
+            if self.engine.p.wavefield_precond:
+                obf = ob_fln.S[oID].data
+                prf = pr_fln.S[pID].data
 
             # Schedule w & I to device
             ev_w, w, data_w = self.engine.w_data.to_gpu(prep.weights, dID, qu_htod)
@@ -545,11 +569,17 @@ def new_grad(self):
 
             use_atomics = self.p.object_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.ob_update_ML_wavefield(addr, obg, obf, pr, aux, atomics=use_atomics)
+            else:
+                POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
 
             use_atomics = self.p.probe_update_cuda_atomics
             addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
-            POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
+            if self.engine.p.wavefield_precond:
+                POK.pr_update_ML_wavefield(addr, prg, prf, ob, aux, atomics=use_atomics)
+            else:
+                POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
 
         queue.synchronize()
         self.engine.dID_list.reverse()
@@ -571,10 +601,18 @@ def new_grad(self):
             s.gpu.get(s.cpu)
         for s in pr_grad.S.values():
             s.gpu.get(s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.get(s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.get(s.cpu)
         self.engine._set_pr_ob_ref_for_data('cpu')
 
         ob_grad.allreduce()
         pr_grad.allreduce()
+        if self.engine.p.wavefield_precond:
+            ob_fln.allreduce()
+            pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # HtoD cause we continue on gpu
@@ -582,6 +620,11 @@ def new_grad(self):
             s.gpu.set(s.cpu)
         for s in pr_grad.S.values():
             s.gpu.set(s.cpu)
+        if self.engine.p.wavefield_precond:
+            for s in ob_fln.S.values():
+                s.gpu.set(s.cpu)
+            for s in pr_fln.S.values():
+                s.gpu.set(s.cpu)        
         self.engine._set_pr_ob_ref_for_data('gpu')
 
         # Object regularizer
diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py
index 8fcebb6ef..bab47a8a3 100644
--- a/ptypy/accelerate/cuda_pycuda/kernels.py
+++ b/ptypy/accelerate/cuda_pycuda/kernels.py
@@ -880,6 +880,18 @@ def __init__(self, queue_thread=None,
             'MATH_TYPE': self.math_type
         })
         self.pr_update2_ML_cuda = None
+        self.ob_update_ML_wavefield_cuda = load_kernel("ob_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.ob_update2_ML_wavefield_cuda = None
+        self.pr_update_ML_wavefield_cuda = load_kernel("pr_update_ML_wavefield", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.pr_update2_ML_wavefield_cuda = None
         self.ob_update_local_cuda = load_kernel("ob_update_local", {
             'IN_TYPE': 'float',
             'OUT_TYPE': 'float',
@@ -1045,6 +1057,46 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
                                     np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
                                     block=(16, 16, 1), grid=grid, stream=self.queue)
 
+    def ob_update_ML_wavefield(self, addr, ob, of, pr, ex, fac=2.0, atomics=True):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError('Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.ob_update_ML_wavefield_cuda(ex, num_pods, exsh[1], exsh[2],
+                                             pr, prsh[0], prsh[1], prsh[2],
+                                             ob, obsh[0], obsh[1], obsh[2],
+                                             of,
+                                             addr,
+                                             np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                             block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue)
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError('Address not in required shape for tiled ob_update')
+
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.ob_update2_ML_wavefield_cuda:
+                self.ob_update2_ML_wavefield_cuda = load_kernel("ob_update2_ML_wavefield", {
+                    "NUM_MODES": obsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+            grid = [int((x+15)//16) for x in ob.shape[-2:]]
+            grid = (grid[1], grid[0], int(1))
+            self.ob_update2_ML_wavefield_cuda(prsh[-1], obsh[0], num_pods,
+                                              obsh[-2], obsh[-1], prsh[0],
+                                              ob, pr, ex, of, addr,
+                                              np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                              block=(16, 16, 1), grid=grid, stream=self.queue)
+
     def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
         obsh = [np.int32(ax) for ax in ob.shape]
         prsh = [np.int32(ax) for ax in pr.shape]
@@ -1081,6 +1133,42 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
                                  np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
                                  block=(16, 16, 1), grid=grid, stream=self.queue)
 
+    def pr_update_ML_wavefield(self, addr, pr, pf, ob, ex, fac=2.0, atomics=False):
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError('Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.pr_update_ML_wavefield_cuda(ex, num_pods, prsh[1], prsh[2],
+                                          pr, prsh[0], prsh[1], prsh[2],
+                                          ob, obsh[0], obsh[1], obsh[2],
+                                          pf,
+                                          addr,
+                                          np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                          block=(32, 32, 1), grid=(int(num_pods), 1, 1), stream=self.queue)
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:
+                raise ValueError('Address not in required shape for tiled pr_update')
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.pr_update2_ML_wavefield_cuda:
+                self.pr_update2_ML_wavefield_cuda = load_kernel("pr_update2_ML_wavefield", {
+                    "NUM_MODES": prsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+
+            grid = [int((x+15)//16) for x in pr.shape[-2:]]
+            grid = (grid[0], grid[1], int(1))
+            self.pr_update2_ML_wavefield_cuda(prsh[-1], obsh[-2], obsh[-1],
+                                           prsh[0], obsh[0], num_pods,
+                                           pr, ob, ex, pf, addr,
+                                           np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
+                                           block=(16, 16, 1), grid=grid, stream=self.queue)
 
     def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
         prn_max = gpuarray.max(prn, stream=self.queue)
diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py
index b5c93dfbe..8cafd78c8 100644
--- a/ptypy/engines/ML.py
+++ b/ptypy/engines/ML.py
@@ -106,6 +106,22 @@ class ML(PositionCorrectionEngine):
     help = How many coefficients to be used in the the linesearch
     doc = choose between the 'quadratic' approximation (default) or 'all'
 
+    [wavefield_precond]
+    default = False
+    type = bool
+    help = Whether to use the wavefield preconditioner
+    doc = This parameter can give faster convergence.
+
+    [wavefield_delta_object]
+    default = 0.1
+    type = float
+    help = Wavefield preconditioner damping constant for the object.
+
+    [wavefield_delta_probe]
+    default = 0.1
+    type = float
+    help = Wavefield preconditioner damping constant for the probe.
+
     """
 
     SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull]
@@ -137,6 +153,9 @@ def __init__(self, ptycho_parent, pars=None):
         # Probe gradient
         self.pr_grad_new = None
 
+        # Object and probe fluence maps
+        self.ob_fln = None
+        self.pr_fln = None
 
         # Other
         self.tmin = None
@@ -172,6 +191,11 @@ def engine_initialize(self):
         self.pr_grad_new = self.pr.copy(self.pr.ID + '_grad_new', fill=0.)
         self.pr_h = self.pr.copy(self.pr.ID + '_h', fill=0.)
 
+        # Object and probe fluence maps
+        if self.p.wavefield_precond:
+            self.ob_fln = self.ob.copy(self.ob.ID + '_fln', fill=0., dtype='real')
+            self.pr_fln = self.pr.copy(self.pr.ID + '_fln', fill=0., dtype='real')
+
         self.tmin = 1.
 
         # Other options
@@ -230,6 +254,12 @@ def engine_iterate(self, num=1):
             else:
                 new_pr_grad.fill(0.)
 
+            # Wavefield preconditioner
+            if self.p.wavefield_precond:
+                for name, s in new_ob_grad.storages.items():
+                    new_ob_grad.storages[name].data /= np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)
+                    new_pr_grad.storages[name].data /= np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe)
+
             # Smoothing preconditioner
             if self.smooth_gradient:
                 self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay)
@@ -276,19 +306,35 @@ def engine_iterate(self, num=1):
             self.pr_grad << new_pr_grad
 
             dt = self.ptycho.FType
+
             # 3. Next conjugate
             self.ob_h *= bt / self.tmin
-
-            # Smoothing preconditioner
-            if self.smooth_gradient:
+            # Wavefield preconditioner for the object (with and without smoothing preconditioner)
+            if self.p.wavefield_precond:
                 for name, s in self.ob_h.storages.items():
-                    s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data)
+                    if self.smooth_gradient:
+                        s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data 
+                                      / np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object))
+                    else:
+                        s.data[:] -= (self.ob_grad.storages[name].data 
+                                      / np.sqrt(self.ob_fln.storages[name].data + self.p.wavefield_delta_object)) 
             else:
-                self.ob_h -= self.ob_grad
+                # Smoothing preconditioner for the object
+                if self.smooth_gradient:
+                    for name, s in self.ob_h.storages.items():
+                        s.data[:] -= self.smooth_gradient(self.ob_grad.storages[name].data)
+                else:
+                    self.ob_h -= self.ob_grad
 
             self.pr_h *= bt / self.tmin
             self.pr_grad *= self.scale_p_o
-            self.pr_h -= self.pr_grad
+            # Wavefield preconditioner for the probe
+            if self.p.wavefield_precond:
+                for name, s in self.pr_h.storages.items():
+                    s.data[:] -= (self.pr_grad.storages[name].data 
+                                  / np.sqrt(self.pr_fln.storages[name].data + self.p.wavefield_delta_probe))
+            else:
+                self.pr_h -= self.pr_grad
 
             # In principle, the way things are now programmed this part
             # could be iterated over in a real Newton-Raphson style.
@@ -309,9 +355,9 @@ def engine_iterate(self, num=1):
                 self.tmin = dt(-0.5 * B[1] / B[2])
             else:
                 raise NotImplementedError("poly_line_coeffs should be 'quadratic' or 'all'")
-                
+
             tc += time.time() - t2
-                
+
             self.ob_h *= self.tmin
             self.pr_h *= self.tmin
             self.ob += self.ob_h
@@ -353,6 +399,11 @@ def engine_finalize(self):
         del self.pr_grad_new
         del self.ptycho.containers[self.pr_h.ID]
         del self.pr_h
+        if self.p.wavefield_precond:
+            del self.ptycho.containers[self.ob_fln.ID]
+            del self.ob_fln
+            del self.ptycho.containers[self.pr_fln.ID]
+            del self.pr_fln
 
         # Save floating intensities into runtime
         self.ptycho.runtime["float_intens"] = parallel.gather_dict(self.ML_model.float_intens_coeff)
@@ -377,6 +428,8 @@ def __init__(self, MLengine):
         self.ob = self.engine.ob
         self.ob_grad = self.engine.ob_grad_new
         self.pr_grad = self.engine.pr_grad_new
+        self.ob_fln = self.engine.ob_fln
+        self.pr_fln = self.engine.pr_fln
         self.pr = self.engine.pr
         self.float_intens_coeff = {}
 
@@ -490,6 +543,9 @@ def new_grad(self):
         """
         self.ob_grad.fill(0.)
         self.pr_grad.fill(0.)
+        if self.p.wavefield_precond:
+            self.ob_fln.fill(0.)
+            self.pr_fln.fill(0.)
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -531,6 +587,11 @@ def new_grad(self):
                 self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj()
                 self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj()
 
+                # Compute fluence maps for object and probe
+                if self.p.wavefield_precond:
+                    self.ob_fln[pod.ob_view] += u.abs2(pod.probe)
+                    self.pr_fln[pod.pr_view] += u.abs2(pod.object)
+
             diff_view.error = LLL
             error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0])
             LL += LLL
@@ -538,6 +599,9 @@ def new_grad(self):
         # MPI reduction of gradients
         self.ob_grad.allreduce()
         self.pr_grad.allreduce()
+        if self.p.wavefield_precond:
+            self.ob_fln.allreduce()
+            self.pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
@@ -733,6 +797,9 @@ def new_grad(self):
         """
         self.ob_grad.fill(0.)
         self.pr_grad.fill(0.)
+        if self.p.wavefield_precond:
+            self.ob_fln.fill(0.)
+            self.pr_fln.fill(0.)
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -774,6 +841,11 @@ def new_grad(self):
                 self.ob_grad[pod.ob_view] += 2 * xi * pod.probe.conj()
                 self.pr_grad[pod.pr_view] += 2 * xi * pod.object.conj()
 
+                # Compute fluence maps for object and probe
+                if self.p.wavefield_precond:
+                    self.ob_fln[pod.ob_view] += u.abs2(pod.probe)
+                    self.pr_fln[pod.pr_view] += u.abs2(pod.object)
+
             diff_view.error = LLL
             error_dct[dname] = np.array([0, LLL / np.prod(DI.shape), 0])
             LL += LLL
@@ -781,6 +853,9 @@ def new_grad(self):
         # MPI reduction of gradients
         self.ob_grad.allreduce()
         self.pr_grad.allreduce()
+        if self.p.wavefield_precond:
+            self.ob_fln.allreduce()
+            self.pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
@@ -989,6 +1064,9 @@ def new_grad(self):
         """
         self.ob_grad.fill(0.)
         self.pr_grad.fill(0.)
+        if self.p.wavefield_precond:
+            self.ob_fln.fill(0.)
+            self.pr_fln.fill(0.)
 
         # We need an array for MPI
         LL = np.array([0.])
@@ -1030,6 +1108,11 @@ def new_grad(self):
                 self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj()
                 self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj()
 
+                # Compute fluence maps for object and probe
+                if self.p.wavefield_precond:
+                    self.ob_fln[pod.ob_view] += u.abs2(pod.probe)
+                    self.pr_fln[pod.pr_view] += u.abs2(pod.object)
+
             diff_view.error = LLL
             error_dct[dname] = np.array([0, LLL / np.prod(DA.shape), 0])
             LL += LLL
@@ -1037,6 +1120,9 @@ def new_grad(self):
         # MPI reduction of gradients
         self.ob_grad.allreduce()
         self.pr_grad.allreduce()
+        if self.p.wavefield_precond:
+            self.ob_fln.allreduce()
+            self.pr_fln.allreduce()
         parallel.allreduce(LL)
 
         # Object regularizer
diff --git a/templates/engines/cupy/moonflower_ML_cupy.py b/templates/engines/cupy/moonflower_ML_cupy.py
index af0427cfb..61845bdd3 100644
--- a/templates/engines/cupy/moonflower_ML_cupy.py
+++ b/templates/engines/cupy/moonflower_ML_cupy.py
@@ -16,7 +16,7 @@
 
 # for verbose output
 p.verbose_level = "info"
-p.frames_per_block = 400
+p.frames_per_block = 100
 # set home path
 p.io = u.Param()
 p.io.home =  "/".join([tmpdir, "ptypy"])
@@ -33,7 +33,7 @@
 p.scans.MF.data= u.Param()
 p.scans.MF.data.name = 'MoonFlowerScan'
 p.scans.MF.data.shape = 128
-p.scans.MF.data.num_frames = 100
+p.scans.MF.data.num_frames = 200
 p.scans.MF.data.save = None
 
 p.scans.MF.illumination = u.Param(diversity=None)
diff --git a/templates/engines/cupy/moonflower_ML_cupy_wavefield.py b/templates/engines/cupy/moonflower_ML_cupy_wavefield.py
new file mode 100644
index 000000000..0e2d896b2
--- /dev/null
+++ b/templates/engines/cupy/moonflower_ML_cupy_wavefield.py
@@ -0,0 +1,58 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 100
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_cupy'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/numpy/moonflower_ML_Gaussian_wavefield.py b/templates/engines/numpy/moonflower_ML_Gaussian_wavefield.py
new file mode 100644
index 000000000..b5f409f68
--- /dev/null
+++ b/templates/engines/numpy/moonflower_ML_Gaussian_wavefield.py
@@ -0,0 +1,56 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+#import ptypy
+from ptypy.core import Ptycho
+from ptypy import utils as u
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+
+# saving intermediate results
+p.io.autosave = u.Param(active=False)
+
+# opens plotting GUI if interaction set to active)
+p.io.autoplot = u.Param(active=True)
+p.io.interaction = u.Param(active=True)
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML'
+p.engines.engine00.ML_type = 'Gaussian'
+p.engines.engine00.numiter = 300
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/pycuda/moonflower_ML_pycuda.py b/templates/engines/pycuda/moonflower_ML_pycuda.py
index 2938275f7..8afcf3517 100644
--- a/templates/engines/pycuda/moonflower_ML_pycuda.py
+++ b/templates/engines/pycuda/moonflower_ML_pycuda.py
@@ -16,7 +16,7 @@
 
 # for verbose output
 p.verbose_level = "info"
-p.frames_per_block = 400
+p.frames_per_block = 100
 # set home path
 p.io = u.Param()
 p.io.home =  "/".join([tmpdir, "ptypy"])
diff --git a/templates/engines/pycuda/moonflower_ML_pycuda_wavefield.py b/templates/engines/pycuda/moonflower_ML_pycuda_wavefield.py
new file mode 100644
index 000000000..04c0528f3
--- /dev/null
+++ b/templates/engines/pycuda/moonflower_ML_pycuda_wavefield.py
@@ -0,0 +1,58 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="pycuda")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 100
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_pycuda'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/serial/moonflower_ML_serial_wavefield.py b/templates/engines/serial/moonflower_ML_serial_wavefield.py
new file mode 100644
index 000000000..3d2d2dc84
--- /dev/null
+++ b/templates/engines/serial/moonflower_ML_serial_wavefield.py
@@ -0,0 +1,59 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+
+import ptypy
+ptypy.load_gpu_engines(arch="serial")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 100
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstrucion engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_serial'
+p.engines.engine00.ML_type = 'Gaussian'
+p.engines.engine00.numiter = 300
+p.engines.engine00.wavefield_precond = True
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/test/accelerate_tests/base_tests/engine_test.py b/test/accelerate_tests/base_tests/engine_test.py
index 855536993..30cb84a55 100644
--- a/test/accelerate_tests/base_tests/engine_test.py
+++ b/test/accelerate_tests/base_tests/engine_test.py
@@ -75,7 +75,7 @@ def check_engine_output(self, output, plotting=False, debug=False):
                                     err_msg="The object arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
-    
+
 
     def test_ML_serial_base(self):
         out = []
@@ -152,6 +152,26 @@ def test_ML_serial_smoothing_regularizer(self):
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
         self.check_engine_output(out, plotting=False, debug=False)
 
+    @pytest.mark.skip(reason="Funny behaviour with this test, most likely due to numerical instabilities, see issue #612")
+    def test_ML_serial_wavefield_preconditioner(self):
+        out = []
+        for eng in ["ML", "ML_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0.
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
     def test_ML_serial_all(self):
         out = []
         for eng in ["ML", "ML_serial"]:
diff --git a/test/accelerate_tests/base_tests/po_update_kernel_test.py b/test/accelerate_tests/base_tests/po_update_kernel_test.py
index 953c26b54..9b3ca5c1b 100644
--- a/test/accelerate_tests/base_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/base_tests/po_update_kernel_test.py
@@ -188,7 +188,7 @@ def test_pr_update(self):
                                       err_msg="The probe denominatorhas not been updated as expected")
 
     def test_pr_update_ML(self):
-        # setup  
+        # setup
         addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
 
         # test
@@ -242,6 +242,100 @@ def test_ob_update_ML(self):
         np.testing.assert_array_equal(object_array, expected_object_array,
                                       err_msg="The object array has not been updated as expected")
 
+    def test_pr_update_ML_wavefield(self):
+        # setup
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        probe_wavefield = np.zeros_like(probe, dtype=FLOAT_TYPE)
+
+        # test
+        POUK = PoUpdateKernel()
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+        POUK.pr_update_ML_wavefield(addr, probe, probe_wavefield, object_array, exit_wave)
+
+        # assert
+        expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]],
+
+                                   [[786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]],
+                                  dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(probe, expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+        # assert
+        expected_probe_wavefield = np.array(
+              [[[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]],
+
+               [[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]]], 
+                dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(probe_wavefield, expected_probe_wavefield,
+                                      err_msg="The probe wavefield has not been updated as expected")
+
+    def test_ob_update_ML_wavefield(self):
+        # setup
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        object_array_wavefield = np.zeros_like(object_array, dtype=FLOAT_TYPE)
+        
+        # test
+        POUK = PoUpdateKernel()
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+        POUK.ob_update_ML_wavefield(addr, object_array, object_array_wavefield, probe, exit_wave)
+
+        # assert
+        expected_object_array = np.array(
+            [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j],
+              [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]],
+
+             [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j],
+              [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]],
+            dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(object_array, expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+        # assert
+        expected_object_array_wavefield = np.array(
+          [[[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+
+           [[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]], 
+        dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(object_array_wavefield, expected_object_array_wavefield,
+                                      err_msg="The object array wavefield has not been updated as expected")
 
     def test_pr_update_local(self):
         # setup
diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py b/test/accelerate_tests/cuda_cupy_tests/engine_tests.py
index fe70b58bc..2dc64c34d 100644
--- a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py
+++ b/test/accelerate_tests/cuda_cupy_tests/engine_tests.py
@@ -71,13 +71,13 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         RMSE_ob = (np.mean(np.abs(OBJ_ML_cupy - OBJ_ML_serial)**2))
         RMSE_pr = (np.mean(np.abs(PRB_ML_cupy - PRB_ML_serial)**2))
         # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2,
                                     err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2,
                                     err_msg="The object arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
-    
+
     def test_ML_cupy_base(self):
         out = []
         for eng in ["ML_serial", "ML_cupy"]:
@@ -151,6 +151,25 @@ def test_ML_cupy_smoothing_regularizer(self):
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
         self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_ML_cupy_wavefield_preconditioner(self):
+        out = []
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 200
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
     def test_ML_cupy_all(self):
         out = []
         for eng in ["ML_serial", "ML_cupy"]:
diff --git a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
index 5b2751877..bad8da21b 100644
--- a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
@@ -648,6 +648,127 @@ def test_ob_update_ML_atomics_REGRESSION(self):
     def test_ob_update_ML_tiled_REGRESSION(self):
         self.ob_update_ML_tester(False)
 
+    def pr_update_ML_wavefield_tester(self, atomics=False):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        probe_wavefield = cp.asarray(np.zeros_like(probe, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = cp.asarray(addr2)
+
+        POUK.pr_update_ML_wavefield(addr, probe, probe_wavefield, object_array, exit_wave, atomics=atomics)
+
+        expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]],
+
+                                   [[786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]],
+                                  dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(probe.get(), expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+        expected_probe_wavefield = np.array(
+              [[[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]],
+
+               [[136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.],
+                [136., 136., 136., 136., 136.]]], 
+                dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(probe_wavefield.get(), expected_probe_wavefield,
+                                      err_msg="The probe wavefield has not been updated as expected")
+
+    def test_pr_update_ML_wavefield_atomics_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(True)
+
+    def test_pr_update_ML_wavefield_tiled_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(False)
+
+    def ob_update_ML_wavefield_tester(self, atomics=True):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        object_array_wavefield = cp.asarray(np.zeros_like(object_array, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = cp.asarray(addr2)
+
+        POUK.ob_update_ML_wavefield(addr, object_array, object_array_wavefield, probe, exit_wave, atomics=atomics)
+
+        expected_object_array = np.array(
+            [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j],
+              [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]],
+
+             [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j],
+              [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]],
+            dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(object_array.get(), expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+        expected_object_array_wavefield = np.array(
+          [[[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+
+           [[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]], 
+            dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(object_array_wavefield.get(), expected_object_array_wavefield,
+                                      err_msg="The object array wavefield has not been updated as expected")
+
+    def test_ob_update_ML_wavefield_atomics_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(True)
+
+    def test_ob_update_ML_wavefield_tiled_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(False)
+
     def test_ob_update_local_UNITY(self):
         '''
         setup
diff --git a/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py b/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py
index 20b950c61..bf1223c2c 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py
@@ -71,13 +71,13 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         RMSE_ob = (np.mean(np.abs(OBJ_ML_pycuda - OBJ_ML_serial)**2))
         RMSE_pr = (np.mean(np.abs(PRB_ML_pycuda - PRB_ML_serial)**2))
         # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2,
                                     err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2, 
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2,
                                     err_msg="The object arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
-    
+
     def test_ML_pycuda_base(self):
         out = []
         for eng in ["ML_serial", "ML_pycuda"]:
@@ -151,6 +151,25 @@ def test_ML_pycuda_smoothing_regularizer(self):
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
         self.check_engine_output(out, plotting=False, debug=False)
 
+    def test_ML_pycuda_wavefield_preconditioner(self):
+        out = []
+        for eng in ["ML_serial", "ML_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 200
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 0
+            engine_params.smooth_gradient_decay = 0.
+            engine_params.scale_precond = False
+            engine_params.wavefield_precond = True
+            engine_params.wavefield_delta_object = 0.1
+            engine_params.wavefield_delta_probe = 0.1
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
     def test_ML_pycuda_all(self):
         out = []
         for eng in ["ML_serial", "ML_pycuda"]:
diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
index 5fc1112cd..5a0167d01 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
@@ -648,6 +648,127 @@ def test_ob_update_ML_atomics_REGRESSION(self):
     def test_ob_update_ML_tiled_REGRESSION(self):
         self.ob_update_ML_tester(False)
 
+    def pr_update_ML_wavefield_tester(self, atomics=False):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        probe_wavefield = gpuarray.to_gpu(np.zeros_like(probe, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = gpuarray.to_gpu(addr2)
+
+        POUK.pr_update_ML_wavefield(addr, probe, probe_wavefield, object_array, exit_wave, atomics=atomics)
+
+        expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]],
+
+                                   [[786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]],
+                                  dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(probe.get(), expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+        expected_probe_wavefield = np.array(
+                                  [[[136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.]],
+
+                                   [[136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.],
+                                    [136., 136., 136., 136., 136.]]], 
+                                    dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(probe_wavefield.get(), expected_probe_wavefield,
+                                      err_msg="The probe wavefield has not been updated as expected")
+
+    def test_pr_update_ML_wavefield_atomics_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(True)
+
+    def test_pr_update_ML_wavefield_tiled_REGRESSION(self):
+        self.pr_update_ML_wavefield_tester(False)
+
+    def ob_update_ML_wavefield_tester(self, atomics=True):
+        '''
+        setup
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        object_array_wavefield = gpuarray.to_gpu(np.zeros_like(object_array, dtype=FLOAT_TYPE))
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))
+            addr = gpuarray.to_gpu(addr2)
+
+        POUK.ob_update_ML_wavefield(addr, object_array, object_array_wavefield, probe, exit_wave, atomics=atomics)
+
+        expected_object_array = np.array(
+            [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j],
+              [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]],
+
+             [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j],
+              [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]],
+            dtype=COMPLEX_TYPE)
+        np.testing.assert_array_equal(object_array.get(), expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+        expected_object_array_wavefield = np.array(
+          [[[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]],
+
+           [[10., 20., 20., 20., 20., 10.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [20., 40., 40., 40., 40., 20.,  0.],
+            [10., 20., 20., 20., 20., 10.,  0.],
+            [ 0.,  0.,  0.,  0.,  0.,  0.,  0.]]], 
+            dtype=FLOAT_TYPE)
+        np.testing.assert_array_equal(object_array_wavefield.get(), expected_object_array_wavefield,
+                                      err_msg="The object array wavefield has not been updated as expected")
+
+    def test_ob_update_ML_wavefield_atomics_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(True)
+
+    def test_ob_update_ML_wavefield_tiled_REGRESSION(self):
+        self.ob_update_ML_wavefield_tester(False)
+
     def test_ob_update_local_UNITY(self):
         '''
         setup

From 3c4a686e68367b5c35c363c3f02a85a278b4489a Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Tue, 29 Apr 2025 16:49:03 +0100
Subject: [PATCH 10/12] Fixed issues with ML engine tests by adding
 normalisation (#613)

---
 .../base_tests/engine_test.py                 | 27 +++++++++++--------
 .../{engine_tests.py => engine_test.py}       | 17 +++++++-----
 .../{engine_tests.py => engine_test.py}       | 19 ++++++++-----
 3 files changed, 39 insertions(+), 24 deletions(-)
 rename test/accelerate_tests/cuda_cupy_tests/{engine_tests.py => engine_test.py} (94%)
 rename test/accelerate_tests/cuda_pycuda_tests/{engine_tests.py => engine_test.py} (93%)

diff --git a/test/accelerate_tests/base_tests/engine_test.py b/test/accelerate_tests/base_tests/engine_test.py
index 30cb84a55..5b108ce88 100644
--- a/test/accelerate_tests/base_tests/engine_test.py
+++ b/test/accelerate_tests/base_tests/engine_test.py
@@ -24,7 +24,7 @@ def setUp(self):
     def tearDown(self):
         shutil.rmtree(self.outpath)
 
-    def check_engine_output(self, output, plotting=False, debug=False):
+    def check_engine_output(self, output, plotting=False, debug=False, tol=0.1):
         P_ML, P_ML_serial = output
         numiter = len(P_ML.runtime["iter_info"])
         LL_ML = np.array([P_ML.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
@@ -34,6 +34,11 @@ def check_engine_output(self, output, plotting=False, debug=False):
         PRB_ML_serial, PRB_ML = P_ML_serial.probe.S["SMFG00"].data[0], P_ML.probe.S["SMFG00"].data[0]
         eng_ML = P_ML.engines["engine00"]
         eng_ML_serial = P_ML_serial.engines["engine00"]
+        # Normalize the outputs
+        PRB_ML_max = np.abs(PRB_ML).max()
+        PRB_ML_serial_max = np.abs(PRB_ML_serial).max()
+        OBJ_ML_serial *= (PRB_ML_serial_max / PRB_ML_max)
+        PRB_ML_serial /= (PRB_ML_serial_max / PRB_ML_max)
         if debug:
             import matplotlib.pyplot as plt
             plt.figure("ML debug")
@@ -67,14 +72,14 @@ def check_engine_output(self, output, plotting=False, debug=False):
         # np.testing.assert_allclose(eng_ML.debug, eng_ML_serial.debug, atol=1e-7, rtol=1e-7,
         #                             err_msg="The debug arrays are not matching as expected")
         RMSE_ob = (np.mean(np.abs(OBJ_ML_serial - OBJ_ML)**2))
-        RMSE_pr = (np.mean(np.abs(PRB_ML_serial - PRB_ML)**2))
-        # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-1, 
+        RMSE_pr = (np.mean(np.abs(PRB_ML_serial - PRB_ML)**2))        
+        #MSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol, 
                                     err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-1, 
-                                    err_msg="The object arrays are not matching as expected")
-        # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
-        #                             err_msg="The log-likelihood errors are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol, 
+                                    err_msg="The probe arrays are not matching as expected")
+        #p.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
+        #                           err_msg="The log-likelihood errors are not matching as expected")
 
 
     def test_ML_serial_base(self):
@@ -152,13 +157,13 @@ def test_ML_serial_smoothing_regularizer(self):
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
         self.check_engine_output(out, plotting=False, debug=False)
 
-    @pytest.mark.skip(reason="Funny behaviour with this test, most likely due to numerical instabilities, see issue #612")
+    @pytest.mark.skip(reason="Funny behaviour with this test, most likely related to Gaussian filter, see issue #607")
     def test_ML_serial_wavefield_preconditioner(self):
         out = []
         for eng in ["ML", "ML_serial"]:
             engine_params = u.Param()
             engine_params.name = eng
-            engine_params.numiter = 100
+            engine_params.numiter = 500
             engine_params.floating_intensities = False
             engine_params.reg_del2 = False
             engine_params.reg_del2_amplitude = 1.
@@ -170,7 +175,7 @@ def test_ML_serial_wavefield_preconditioner(self):
             engine_params.wavefield_delta_probe = 0.1
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
-        self.check_engine_output(out, plotting=False, debug=False)
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_serial_all(self):
         out = []
diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py b/test/accelerate_tests/cuda_cupy_tests/engine_test.py
similarity index 94%
rename from test/accelerate_tests/cuda_cupy_tests/engine_tests.py
rename to test/accelerate_tests/cuda_cupy_tests/engine_test.py
index 2dc64c34d..67851d2cc 100644
--- a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py
+++ b/test/accelerate_tests/cuda_cupy_tests/engine_test.py
@@ -23,7 +23,7 @@ def setUp(self):
     def tearDown(self):
         shutil.rmtree(self.outpath)
 
-    def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
+    def check_engine_output(self, output, plotting=False, debug=False, scan="MF", tol=0.1):
         key = "S%sG00" %scan
         P_ML_serial, P_ML_cupy = output
         numiter = len(P_ML_serial.runtime["iter_info"])
@@ -36,6 +36,11 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         MED_ML_cupy = np.median(np.angle(OBJ_ML_cupy))
         eng_ML_serial = P_ML_serial.engines["engine00"]
         eng_ML_cupy = P_ML_cupy.engines["engine00"]
+        # Normalize the outputs
+        PRB_ML_serial_max = np.abs(PRB_ML_serial).max()
+        PRB_ML_cupy_max = np.abs(PRB_ML_cupy).max()
+        OBJ_ML_cupy *= (PRB_ML_cupy_max / PRB_ML_serial_max)
+        PRB_ML_cupy /= (PRB_ML_cupy_max / PRB_ML_serial_max)
         if debug:
             import matplotlib.pyplot as plt
             plt.figure("ML serial debug")
@@ -71,10 +76,10 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         RMSE_ob = (np.mean(np.abs(OBJ_ML_cupy - OBJ_ML_serial)**2))
         RMSE_pr = (np.mean(np.abs(PRB_ML_cupy - PRB_ML_serial)**2))
         # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2,
-                                    err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2,
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol,
                                     err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol,
+                                    err_msg="The probe arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
 
@@ -156,7 +161,7 @@ def test_ML_cupy_wavefield_preconditioner(self):
         for eng in ["ML_serial", "ML_cupy"]:
             engine_params = u.Param()
             engine_params.name = eng
-            engine_params.numiter = 200
+            engine_params.numiter = 500
             engine_params.floating_intensities = False
             engine_params.reg_del2 = False
             engine_params.reg_del2_amplitude = 1.
@@ -168,7 +173,7 @@ def test_ML_cupy_wavefield_preconditioner(self):
             engine_params.wavefield_delta_probe = 0.1
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
-        self.check_engine_output(out, plotting=False, debug=False)
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_cupy_all(self):
         out = []
diff --git a/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py b/test/accelerate_tests/cuda_pycuda_tests/engine_test.py
similarity index 93%
rename from test/accelerate_tests/cuda_pycuda_tests/engine_tests.py
rename to test/accelerate_tests/cuda_pycuda_tests/engine_test.py
index bf1223c2c..39184b2ff 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/engine_tests.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/engine_test.py
@@ -23,7 +23,7 @@ def setUp(self):
     def tearDown(self):
         shutil.rmtree(self.outpath)
 
-    def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
+    def check_engine_output(self, output, plotting=False, debug=False, scan="MF", tol=0.1):
         key = "S%sG00" %scan
         P_ML_serial, P_ML_pycuda = output
         numiter = len(P_ML_serial.runtime["iter_info"])
@@ -36,6 +36,11 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         MED_ML_pycuda = np.median(np.angle(OBJ_ML_pycuda))
         eng_ML_serial = P_ML_serial.engines["engine00"]
         eng_ML_pycuda = P_ML_pycuda.engines["engine00"]
+        # Normalize the outputs
+        PRB_ML_serial_max = np.abs(PRB_ML_serial).max()
+        PRB_ML_pycuda_max = np.abs(PRB_ML_pycuda).max()
+        OBJ_ML_pycuda *= (PRB_ML_pycuda_max / PRB_ML_serial_max)
+        PRB_ML_pycuda /= (PRB_ML_pycuda_max / PRB_ML_serial_max)
         if debug:
             import matplotlib.pyplot as plt
             plt.figure("ML serial debug")
@@ -71,10 +76,10 @@ def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
         RMSE_ob = (np.mean(np.abs(OBJ_ML_pycuda - OBJ_ML_serial)**2))
         RMSE_pr = (np.mean(np.abs(PRB_ML_pycuda - PRB_ML_serial)**2))
         # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2))
-        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2,
-                                    err_msg="The object arrays are not matching as expected")
-        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2,
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=tol,
                                     err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=tol,
+                                    err_msg="The probe arrays are not matching as expected")
         # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7,
         #                             err_msg="The log-likelihood errors are not matching as expected")
 
@@ -149,14 +154,14 @@ def test_ML_pycuda_smoothing_regularizer(self):
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
-        self.check_engine_output(out, plotting=False, debug=False)
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_pycuda_wavefield_preconditioner(self):
         out = []
         for eng in ["ML_serial", "ML_pycuda"]:
             engine_params = u.Param()
             engine_params.name = eng
-            engine_params.numiter = 200
+            engine_params.numiter = 700
             engine_params.floating_intensities = False
             engine_params.reg_del2 = False
             engine_params.reg_del2_amplitude = 1.
@@ -168,7 +173,7 @@ def test_ML_pycuda_wavefield_preconditioner(self):
             engine_params.wavefield_delta_probe = 0.1
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
                                            scanmodel="BlockFull", autosave=False, verbose_level="critical"))
-        self.check_engine_output(out, plotting=False, debug=False)
+        self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_pycuda_all(self):
         out = []

From d7ec1c9c6216cfad58d42a031bd67e8201a28bbd Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Tue, 29 Apr 2025 21:26:05 +0100
Subject: [PATCH 11/12] Add option to provide block size for engine test runner
 (#614)

---
 test/accelerate_tests/base_tests/engine_test.py    | 12 ++++++------
 .../cuda_cupy_tests/engine_test.py                 | 14 +++++++-------
 .../cuda_pycuda_tests/engine_test.py               | 14 +++++++-------
 test/utils.py                                      |  4 +++-
 4 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/test/accelerate_tests/base_tests/engine_test.py b/test/accelerate_tests/base_tests/engine_test.py
index 5b108ce88..3177906fe 100644
--- a/test/accelerate_tests/base_tests/engine_test.py
+++ b/test/accelerate_tests/base_tests/engine_test.py
@@ -93,7 +93,7 @@ def test_ML_serial_base(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_serial_regularizer(self):
@@ -107,7 +107,7 @@ def test_ML_serial_regularizer(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
 
@@ -123,7 +123,7 @@ def test_ML_serial_preconditioner(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_serial_floating(self):
@@ -137,7 +137,7 @@ def test_ML_serial_floating(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     @pytest.mark.skip(reason="Funny behaviour with this test, most likely related to Gaussian filter, see issue #607")
@@ -154,7 +154,7 @@ def test_ML_serial_smoothing_regularizer(self):
             engine_params.smooth_gradient_decay = 1/10.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     @pytest.mark.skip(reason="Funny behaviour with this test, most likely related to Gaussian filter, see issue #607")
@@ -174,7 +174,7 @@ def test_ML_serial_wavefield_preconditioner(self):
             engine_params.wavefield_delta_object = 0.1
             engine_params.wavefield_delta_probe = 0.1
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_serial_all(self):
diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_test.py b/test/accelerate_tests/cuda_cupy_tests/engine_test.py
index 67851d2cc..2eae5c545 100644
--- a/test/accelerate_tests/cuda_cupy_tests/engine_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/engine_test.py
@@ -94,7 +94,7 @@ def test_ML_cupy_base(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_regularizer(self):
@@ -108,7 +108,7 @@ def test_ML_cupy_regularizer(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_preconditioner(self):
@@ -123,7 +123,7 @@ def test_ML_cupy_preconditioner(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_floating(self):
@@ -137,7 +137,7 @@ def test_ML_cupy_floating(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_smoothing_regularizer(self):
@@ -153,7 +153,7 @@ def test_ML_cupy_smoothing_regularizer(self):
             engine_params.smooth_gradient_decay = 1/10.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_cupy_wavefield_preconditioner(self):
@@ -172,7 +172,7 @@ def test_ML_cupy_wavefield_preconditioner(self):
             engine_params.wavefield_delta_object = 0.1
             engine_params.wavefield_delta_probe = 0.1
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_cupy_all(self):
@@ -189,7 +189,7 @@ def test_ML_cupy_all(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="info"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="info", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
 if __name__ == "__main__":
diff --git a/test/accelerate_tests/cuda_pycuda_tests/engine_test.py b/test/accelerate_tests/cuda_pycuda_tests/engine_test.py
index 39184b2ff..b1148b8a6 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/engine_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/engine_test.py
@@ -94,7 +94,7 @@ def test_ML_pycuda_base(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_regularizer(self):
@@ -108,7 +108,7 @@ def test_ML_pycuda_regularizer(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_preconditioner(self):
@@ -123,7 +123,7 @@ def test_ML_pycuda_preconditioner(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_floating(self):
@@ -137,7 +137,7 @@ def test_ML_pycuda_floating(self):
             engine_params.reg_del2_amplitude = 1.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
     def test_ML_pycuda_smoothing_regularizer(self):
@@ -153,7 +153,7 @@ def test_ML_pycuda_smoothing_regularizer(self):
             engine_params.smooth_gradient_decay = 1/10.
             engine_params.scale_precond = False
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_pycuda_wavefield_preconditioner(self):
@@ -172,7 +172,7 @@ def test_ML_pycuda_wavefield_preconditioner(self):
             engine_params.wavefield_delta_object = 0.1
             engine_params.wavefield_delta_probe = 0.1
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False, tol=0.3)
 
     def test_ML_pycuda_all(self):
@@ -189,7 +189,7 @@ def test_ML_pycuda_all(self):
             engine_params.scale_precond = True
             engine_params.scale_probe_object = 1e-6
             out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
-                                           scanmodel="BlockFull", autosave=False, verbose_level="info"))
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical", frames_per_block=100))
         self.check_engine_output(out, plotting=False, debug=False)
 
 if __name__ == "__main__":
diff --git a/test/utils.py b/test/utils.py
index 5e184faa6..1a4e9dd3b 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -45,9 +45,11 @@ def PtyscanTestRunner(ptyscan_instance, data_params, save_type='append', auto_fr
 
 
 def EngineTestRunner(engine_params,propagator='farfield',output_path='./', output_file=None,
-                    autosave=True, scanmodel="Full", verbose_level="info", init_correct_probe=False):
+                    autosave=True, scanmodel="Full", verbose_level="info", init_correct_probe=False,
+                    frames_per_block=100000):
 
     p = u.Param()
+    p.frames_per_block = frames_per_block
     p.verbose_level = verbose_level
     p.io = u.Param()
     p.io.home = output_path

From f0a721626a556f5c1990e026447bc241758837c5 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 8 May 2025 16:49:59 +0100
Subject: [PATCH 12/12] Added 0.9 release notes (#611)

---
 release_notes.md | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/release_notes.md b/release_notes.md
index bed47e241..489885686 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -1,3 +1,29 @@
+# PtyPy 0.9 release notes
+
+We're excited to bring you a new release in preparation for the [PtyPy workshop 2025](https://www.synchrotron-soleil.fr/en/events/ptypy-2025)
+held at SOLEIL. This release provides a new pre-conditioner for the ML engine and a few bug fixes.
+
+## ML with wavefield pre-conditioner
+
+We have added a new pre-conditioner to the ML engines, which seems to significantly improve probe retrieval
+and overall convergence. It can be enabled by setting the following parameters
+
+```
+p.engines.engine.name = "ML"
+p.engines.engine.wavefield_precond = True
+p.engines.engine.wavefield_delta_object = 0.1
+p.engines.engine.wavefield_delta_probe = 0.1
+```
+
+where ```wavefield_delta_object``` and ```wavefield_delta_probe``` are regularisation parameters to avoid
+division by zero. 
+
+## Other changes
+
+We have dropped support for Python <= 3.8 and adopted Numpy >= 2.0.
+We have also fixed an issue with the filtered_fft build and made small improvements to the custom WASP engine.
+
+
 # PtyPy 0.8 release notes
 
 We're excited to bring you a new release, with new engines, CuPy support and
@@ -43,9 +69,8 @@ Patch release.
 
 # PtyPy 0.7 release notes
 
-This release is focused on improving the usability of PtyPy in Jupyter notebooks in preparation for the 
-[PtyPy workshop](https://www.diamond.ac.uk/Home/Events/2023/Ptychography--PtyPy--Software-Workshop-2023.html) 
-held at the Diamond Light Source in January 2023. The workshop features extensive interactive 
+This release is focused on improving the usability of PtyPy in Jupyter notebooks in preparation for the first 
+PtyPy workshop held at the Diamond Light Source in January 2023. The workshop features extensive interactive 
 [tutorials](https://ptycho.github.io/tutorials) delivered using Jupyter notebooks. 
 
 ## Build changes